Loading [MathJax]/extensions/TeX/AMSmath.js

mathjax

Friday, October 12, 2012

Where did my Tweet go ? Tracking retweets through the social graph.


D3 visualization of a tweet propagation for @OReillyMedia


Everybody tweets these days but not all tweets are created equal, and  neither are tweeters for that matter. One of the truest measures of a tweet and, by proxy, the tweeter, is how far into the social-graph the message propagates via retweeting. Retweeting is an explicit indication of engagement on the part of the retweeter and in most cases, an endorsement of the message and the original author. That all said, it is an interesting exercise to see where in the social-graph a particular tweet reached. The Twitter API makes it easy to tell how many times and by whom a message was retweeted but it takes a bit more legwork to determine the path taken to the recipients.

A simple method to follow the propagation of a tweet is to do a breadth-first traversal of followers links, starting at the message author, until all retweeters have been accounted for. Obviously there are some assumptions wrapped up in this methodology but for the most part evidence supports the results. The Python script below performs this walk through the social graph. For economy against the Twitter API, the script caches follower lists in a Redis server so that they may be re-used for subsequent runs. This scheme works best when examining tweets which are closely related and incorporate many of the same Twitter users.

For visualization purposes, the Python script outputs a JSON file for consumption by a D3 force-directed graph template. D3 expects nodes and links enumerated in separate lists, the link elements making reference to the node elements via node list indices. A sample graph is shown above, visualizing  the path of a tweet from @OReillyMedia. Twitter users are indicated by their avatars and a grey circle with radius proportional to the logarithm of the number of followers. The originator of the message is indicated with a red circle. The graph title gives the text of the tweet, the overall retweet count, and the number of users reached by the message (sum of everyone's followers).

While the ability to gather broad insight with this method is limited by Twitter API rate controls, it could be used to do a focused study on a specific Twitter user, looking for prominent social-graph pathways and individuals that warrant reciprocation. Failing that, the D3 transitions as the graph builds and stabilizes makes fascinating viewing.


#!/usr/bin/python -u
#
# Usage: ./trace.py <tweetId>
#
import sys
import tweepy
import Queue
import time
import json
import redis
CONSUMER_KEY = 'xxx'
CONSUMER_SECRET = 'xxx'
ACCESS_KEY = 'xxx'
ACCESS_SECRET = 'xxx'
REDIS_FOLLOWERS_KEY = "followers:%s"
# Retweeter who have not yet been connected to the social graph
unconnected = {}
# Retweeters connected to the social graph...become seeds for deeper search
connected = Queue.Queue()
# Social graph
links = []
nodes = []
#----------------------------------------
def addUserToSocialGraph (parent, child):
# parent: tweepy.models.User
# child: tweepy.models.User
#----------------------------------------
global links;
if (child):
nodes.append ({'id':child.id,
'screen_name':child.screen_name,
'followers_count':child.followers_count,
'profile_image_url':child.profile_image_url})
# TODO: Find child and parent indices in nodes in order to create the links
if (parent):
print (nodes)
print ("Adding to socialgraph: %s ==> %s" % (parent.screen_name, child.screen_name))
links.append ({'source':getNodeIndex (parent),
'target':getNodeIndex (child)})
#----------------------------------------
def getNodeIndex (user):
# node: tweepy.models.User
#----------------------------------------
global nodes
for i in range(len(nodes)):
if (user.id == nodes[i]["id"]):
return i
return -1
#----------------------------------------
def isFollower (parent, child):
# parent: tweepy.models.User
# child: tweepy.models.User
#----------------------------------------
global red
# Fetch data from Twitter if we dont have it
key = REDIS_FOLLOWERS_KEY % parent.screen_name
if ( not red.exists (key) ):
print ("No follower data for user %s" % parent.screen_name)
crawlFollowers (parent)
cache_count = red.hlen (key)
if ( parent.followers_count > (cache_count*1.1) ):
print ("Incomplete follower data for user %s. Have %d followers but should have %d (exceeds 10% margin for error)."
% (parent.screen_name, cache_count, parent.followers_count))
crawlFollowers (parent)
return red.hexists (key, child.screen_name)
#----------------------------------------
def crawlFollowers (user):
# user: tweepy.models.User
#----------------------------------------
print ("Retrieving followers for %s (%d)" % (user.screen_name, user.followers_count))
count = 0
follower_cursors = tweepy.Cursor (api.followers, id = user.id)
followers_iter = follower_cursors.items()
follower = None
while True:
try:
# We may have to retry a failed follower lookup
if ( follower is None ):
follower = followers_iter.next()
# Add link to Redis
red.hset ("followers:%s" % user.screen_name, follower.screen_name, follower.followers_count)
follower = None
count += 1
except StopIteration:
break
except tweepy.error.TweepError as (err):
print ("Caught TweepError: %s" % (err))
if (err.reason == "Not authorized" ):
print ("Not authorized to see users followers. Skipping.")
break
limit = api.rate_limit_status()
if (limit['remaining_hits'] == 0):
seconds_until_reset = int (limit['reset_time_in_seconds'] - time.time())
print ("API request limit reached. Sleeping for %s seconds" % seconds_until_reset)
time.sleep (seconds_until_reset + 5)
else:
print ("Sleeping a few seconds and then retrying")
time.sleep (5)
print ("Added %d followers of user %s" % (count, user.screen_name))
#----------------------------------------
# Main
#----------------------------------------
tweetId = sys.argv[1]
# Connect to Redis
red = redis.Redis(unix_socket_path="/tmp/redis.sock")
# Connect to Twitter
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth)
print (api.rate_limit_status())
# Get original Tweet details
status = api.get_status (tweetId)
connected.put(status.user)
addUserToSocialGraph (None, status.user)
retweets = api.retweets (status.id)
print ("Tweet %s, originally posted by %s, was retweeted by..." % (status.id, status.user.screen_name))
for retweet in retweets:
print (retweet.user.screen_name)
unconnected[retweet.user.screen_name] = retweet.user;
# Pivot
while not (connected.empty() or len(unconnected)==0):
# Get next user
pivot = connected.get()
# Check followers of this user against unconnected retweeters
print ("Looking through followers of %s" % pivot.screen_name)
for (screen_name, retweeter) in unconnected.items():
if (isFollower(pivot, retweeter)):
print ("%s <=== %s" % (pivot.screen_name, retweeter.screen_name))
connected.put (retweeter)
addUserToSocialGraph (pivot, retweeter)
del unconnected[retweeter.screen_name]
else:
print ("%s <=X= %s" % (pivot.screen_name, retweeter.screen_name))
# Add unconnected nodes to social graph
for (screen_name, user) in unconnected.items():
addUserToSocialGraph (None, user)
# Encode data as JSON
filename = "%s.json" % status.id
print ("\n\nWriting JSON to %s" % filename)
tweet = {'id':status.id,
'retweet_count':status.retweet_count,
'text':status.text,
'author':status.user.id}
f = open (filename, 'w')
f.write (json.dumps({'tweet':tweet, 'nodes':nodes, 'links':links}, indent=2))
f.close
sys.exit()
view raw trace.py hosted with ❤ by GitHub

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<script type="text/javascript" src="d3.js"></script>
<style>
.link {
stroke: #ccc;
stroke-width: 2
}
.node text {
pointer-events: none;
font: 10px sans-serif;
}
.originator {
fill: none;
stroke: red;
stroke-width: 2
}
.followers {
fill: none;
stroke: #ccc;
stroke-width: 2
}
.tweet-text {
pointer-events: none;
font: 20px sans-serif;
fill: black
}
.tweet-retweetcount {
pointer-events: none;
font: 15px sans-serif;
fill: #ccc
}
.tweet-reach {
pointer-events: none;
font: 15px sans-serif;
fill: #ccc
}
</style>
</head>
<body>
<script>
function getUrlVars() {
var vars = {};
var parts = window.location.href.replace(/[?&]+([^=&]+)=([^&]*)/gi, function(m,key,value) {
vars[key] = value;
});
return vars;
}
reach = null
tweetId = getUrlVars()["id"]
console.log(tweetId)
d3.json("data/" + tweetId + ".json", function(json){
var width = 960,
height = 960;
var svg = d3.select('body')
.append('svg')
.attr('width', width)
.attr('height', height);
// draw the graph edges
var link = svg.selectAll("link")
.data(json.links)
.enter()
.append("line")
.attr("class","link");
// draw the graph nodes
var node = svg.selectAll("node")
.data(json.nodes)
.enter()
.append("g")
.attr("class", "node")
// create the layout
var force = d3.layout.force()
.gravity(0.1)
.linkDistance(200)
.charge(-800)
.size([width, height])
.nodes(json.nodes)
.links(json.links)
.start();
// define what to do one each tick of the animation
force.on("tick", function() {
link.attr("x1", function(d) { return d.source.x; })
.attr("y1", function(d) { return d.source.y; })
.attr("x2", function(d) { return d.target.x; })
.attr("y2", function(d) { return d.target.y; });
node.attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; });
});
// bind the drag interaction to the nodes
node.call(force.drag);
// Node images
node.append("image")
.attr("xlink:href", function(d) { return d.profile_image_url })
.attr("x", -20)
.attr("y", -20)
.attr("width", 40)
.attr("height", 40);
// Label nodes
node.append("text")
.attr("dx", 20)
.attr("dy", ".35em")
.text(function(d) { return d.screen_name });
// Follower count circles
node.append("circle")
.attr("class", "followers")
.attr("r", function(d) { reach += d.followers_count; return 5*Math.log(d.followers_count+2)})
// Mark originator node
d3.select (".node").append("circle").attr("class", "originator").attr("r", 20)
// Tweet details
svg.append("text")
.attr("class", "tweet-text")
.attr("x", 10)
.attr("y", 20)
.text(json.tweet.text)
svg.append("text")
.attr("class", "tweet-retweetcount")
.attr("x", 10)
.attr("y", 40)
.text("Retweet Count: " + json.tweet.retweet_count)
svg.append("text")
.attr("class", "tweet-reach")
.attr("x", 10)
.attr("y", 60)
.text("Reach: " + reach)
});
</script>
</body>
</html>
view raw retweet.html hosted with ❤ by GitHub

No comments:

Post a Comment