1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
|
#! /usr/bin/env python
# Twitter scraping script. Takes a line-separated list of twitter usernames
# and pulls down an arbitrary number of tweets from each profile.
#
# To use this script, you will need:
# - Tweepy
# - a newline-delimited file of usernames, one per line (e.g. as_te_li)
# - a file called 'keys.py' containing the variable assignments
# detailed below (application codes and account linking codes)
#
# The Twitter API is rate limited, therefore trying to fetch statuses from
# over 350 users within an hour will cause twitter to block for an hour.
#
# Usage: scan_users.py <username list> <tweet dump target path> <input seek-to line>
#
# All arguments are optional.
#
import sys
import signal
import tweepy
import time
import os
# keeping the access key/access secret in a separate file
# so that each person running this script can easily use their
# own access keys generated by link_acct.py
# Your 'keys.py' file should look like this
#ACCESS_KEY = '284417887-quUxFooBarrandomstuffgoeshere9001'
#ACCESS_SECRET = 'moReRand0ml0ok1ngstUfF1nh3re838hhsh36'
#CONSUMER_KEY = 'mJ6vPoSPrseFBsE2noKg'
#CONSUMER_SECRET = 'oFxu6wKrWVLRvy2kHzVvPKpEhT9XuSrz6bcmcluDbw'
import keys
############################# CONTROL SIG SETUP ##############################
# Catch SIGINT so Ctrl-C stops the scan straight away. sys.exit() works by
# raising SystemExit, which any bare `except:` that happens to be active when
# the signal arrives will swallow; os._exit() ends the process directly and
# cannot be intercepted that way.
def signal_handler(signal, frame):
    """Handle SIGINT: announce it, close any open files and exit with status 0.

    Args:
        signal: number of the delivered signal (unused). Inside this function
            the name shadows the ``signal`` module.
        frame: stack frame that was interrupted (unused).
    """
    try:
        print("\nSIGINT caught, exiting. (PID: %d)" % (os.getpid()))
        # The handler is installed before the script opens its files, so
        # either handle may not exist yet; look them up instead of assuming.
        for handle_name in ("username_file_desc", "tweet_output_file"):
            handle = globals().get(handle_name)
            if handle is not None:
                handle.close()
        # os._exit() does not flush stdio buffers, so push the message out
        # explicitly in case stdout is a pipe or a file.
        sys.stdout.flush()
    finally:
        # Leave no matter what went wrong during the clean-up above.
        os._exit(0)


signal.signal(signal.SIGINT, signal_handler)
############################# ARGUMENT PARSING ##############################
def parse_cli_args(argv):
    """Pull the three optional settings out of a command line.

    Args:
        argv: full argument vector with the program name first (the script
            passes ``sys.argv``).

    Returns:
        A ``(username_file, user_output_file, skip_to)`` tuple:
        the username list path (default ``"usernames.txt"``), the tweet dump
        path or ``0`` when none was given, and the integer seek-to line
        (``0`` when missing or not a number).
    """
    names_path = argv[1].strip() if len(argv) > 1 else "usernames.txt"
    # 0 is the "no output file" sentinel that the rest of the script tests for.
    dump_path = argv[2].strip() if len(argv) > 2 else 0
    start_line = 0
    if len(argv) > 3:
        try:
            start_line = int(argv[3].strip())
        except ValueError:
            # Only a malformed number is expected here; anything else
            # (e.g. a Ctrl-C) must not be mistaken for bad input.
            print("Seek-to-line value supplied is not a number. Defaulting to 0.")
    return names_path, dump_path, start_line


arg_count = len(sys.argv)
# protip for the user if they get it wrong
if arg_count < 1 or arg_count > 4:
    # An empty argv has no program name to show, so fall back to the
    # script's documented name.
    program_name = sys.argv[0] if sys.argv else "scan_users.py"
    print("\nUsage: %s <username_file (opt)> <tweet_dump_file (opt)> <skip#(opt)>" % (program_name))
    # os._exit() skips stdio flushing; flush so the usage text is not lost
    # when stdout is redirected.
    sys.stdout.flush()
    os._exit(1)

username_file, user_output_file, skip_to = parse_cli_args(sys.argv)
# 1-based number of the input line about to be read.
cur_line_no = 1
print("Using: username_file='%s' output_file='%s' seek-to-line=%s" % (username_file, user_output_file, skip_to))
############################# TWITTER AUTH #################################
# Authenticate against Twitter with the credentials from keys.py: first the
# application's consumer pair, then the access pair of the linked account.
consumer_credentials = (keys.CONSUMER_KEY, keys.CONSUMER_SECRET)
auth = tweepy.OAuthHandler(*consumer_credentials)
access_credentials = (keys.ACCESS_KEY, keys.ACCESS_SECRET)
auth.set_access_token(*access_credentials)
# Every later request in the script goes through this client object.
api = tweepy.API(auth)
############################# FILE I/O SETUP ###############################
# Set up the output file if one was requested (0 means "none given").
if user_output_file != 0:
    try:
        # Tweets are encoded to UTF-8 before being written, so the target
        # is opened in binary mode.
        tweet_output_file = open(user_output_file, mode='wb')
    except (IOError, OSError):
        print("Could not open specified output file to write, exiting.")
        # os._exit() does not flush stdio buffers; flush so the message
        # survives when stdout is redirected.
        sys.stdout.flush()
        os._exit(1)

# Attempt to open the username input file for reading.
try:
    username_file_desc = open(username_file, 'r')
except (IOError, OSError):
    print("Could not open username file, exiting.")
    # Do not leave the already-opened output file dangling on the way out.
    if user_output_file != 0:
        tweet_output_file.close()
    sys.stdout.flush()
    os._exit(1)
############################# TWEET FETCHING ###############################
# The actual scanning of each user's statuses happens here.
# NOTE(review): the nesting below is reconstructed -- it assumes every tweet
# is echoed to stdout (otherwise a run without an output file would show
# nothing) and that the line counter advances once per input line (otherwise
# skipping could never end). Confirm against the original layout.
for scan_target in username_file_desc:
    username = scan_target.strip()
    # Lines before [skip_to] are skipped; blank lines carry no username to
    # look up, so they are passed over as well (but still counted).
    if cur_line_no >= skip_to and username:
        try:
            # pulls down [count] tweets
            user_statuses = api.user_timeline(username, count=1)
            # Each status carries further metadata (location, time zone, ...)
            # reachable as 'status.[metadata_type]'.
            for status in user_statuses:
                cur_tweet = "date=[%s] username=[%s] text=[%s]" % (status.created_at, username, status.text)
                if user_output_file != 0:
                    try:
                        # One tweet per line, encoded to UTF-8 bytes for the
                        # binary-mode output file.
                        tweet_output_file.write((cur_tweet + "\n").encode('utf-8'))
                    except Exception:
                        print("Could not write output file.")
                print(cur_tweet)
        except Exception:
            # Best effort: report the user and move on to the next one.
            # `Exception` (not a bare except) lets SystemExit and
            # KeyboardInterrupt through.
            print("Oops, %s's posts are unavailable." % (username))
        # A delay of 11 seconds per user would keep the request rate under
        # 350/hour, the point at which twitter stops responding. Disabled:
        # time.sleep(11.0)
    # update line cursor
    cur_line_no = cur_line_no + 1
############################# END OF PROGRAM ############################
# Close both files explicitly: os._exit() below skips the interpreter's
# normal clean-up, so unwritten data in an open file would not reach disk.
username_file_desc.close()
if user_output_file != 0:
    tweet_output_file.close()
# os._exit() also leaves stdio buffers unflushed; without this, tweets echoed
# to a redirected stdout could be cut off at the end of the run.
sys.stdout.flush()
os._exit(0)
|