Reputation: 1220
When a single hash is found in the tweet URL , The script inserts the values into MYSQL
DB properly . When there is 2 or more hash found in the tweet URL , The records are inserted
twice in the MYSQL DB.
For example , If a tweet has 2 URLS with hashes mentioned , In MYSQL DB 4 records are created.
DB State:
"https://www.virustotal.com/en/file/2819e520dea611c4dd1c3b1fd54adbd0c50963ff75d67cc7facbe2090574afc0/analysis/","2017-09-20 01:00:35","2819e520dea611c4dd1c3b1fd54adbd0c50963ff75d67cc7facbe2090574afc0"
"https://www.virustotal.com/en/file/8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89/analysis/","2017-09-20 01:03:35","8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89"
"https://www.virustotal.com/en/file/b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0/analysis/","2017-09-20 01:03:35","8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89"
"https://www.virustotal.com/en/file/8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89/analysis/","2017-09-20 01:03:35","b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0"
"https://www.virustotal.com/en/file/b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0/analysis/","2017-09-20 01:03:35","b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0"
Any suggestions on how to insert only single entries to DB ?
#! /usr/bin/python
from __future__ import print_function
import tweepy
import json
import MySQLdb
import time
import json, urllib, urllib2, argparse, hashlib, re, sys
from dateutil import parser
WORDS = ['virustotal']
CONSUMER_KEY = "XXXX"
CONSUMER_SECRET = "YYY"
ACCESS_TOKEN = "AAAA"
ACCESS_TOKEN_SECRET = "DDDDD"
HOST = "192.168.150.1"
USER = "admin"
PASSWD = "admin"
DATABASE = "twitter"
def store_data(values, insert_time, insert_hash):
db=MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DATABASE, charset="utf8")
cursor = db.cursor()
data = []
#print(hashes)
for value in values:
data.append((value, insert_time, insert_hash))
cursor.executemany("""INSERT INTO tweet_url VALUES (%s,%s,%s)""",data)
db.commit()
cursor.close()
db.close()
return
class StreamListener(tweepy.StreamListener):
def on_connect(self):
print("We are now connected to the streaming API.")
def on_error(self, status_code):
print('An Error has occured: ' + repr(status_code))
return False
def on_data(self, data):
try:
datajson = json.loads(data)
web_url= datajson['entities']['urls']
#print(web_url)
urls=[]
for i in web_url:
urls.append((i['expanded_url']))
values = [list([item]) for item in urls]
list_url = ','.join([str(i) for i in values])
extract_url=str(list_url)
formatted_url=''.join(extract_url)
sha256_hash=re.findall(r"([a-fA-F\d]{64})", formatted_url)
hashes=''.join(sha256_hash)
insert_time=time.strftime('%Y-%m-%d %H:%M:%S')
hash_list=re.findall(r"([a-fA-F\d]{64})", hashes)
for insert_hash in hash_list:
store_data(values, insert_time, insert_hash)
print(store_data)
print(hashes)
print(type(hashes))
except Exception as e:
print(e)
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
listener = StreamListener(api=tweepy.API(wait_on_rate_limit=True))
streamer = tweepy.Stream(auth=auth, listener=listener)
print("Tracking: " + str(WORDS))
streamer.filter(track=WORDS)
Upvotes: 1
Views: 829
Reputation: 13327
You have a first loop :
for insert_hash in hash_list:
store_data(values, insert_time, insert_hash)
And then you loop again on the values to build the data list of tuples :
for value in values:
data.append((value, insert_time, insert_hash))
So the values are called twice.
Maybe you could use zip()
or enumerate()
to join the hash_list
and the values
before calling store_data
?
data = []
if len(values) == len(hash_list):
for val,hash in zip(values, hash_list):
data.append((val, insert_time, hash))
store_data(data)
And then, no need to loop again inside store_data()
, just change the signature to pass directly the data list:
def store_data(data_list):
# connection to database
cursor.executemany("""INSERT INTO tweet_url VALUES (%s,%s,%s)""",data_list)
Upvotes: 2