Reputation: 331
I have a text file composed by different dictionaries and it looks like this:
{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "[email protected]", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"}
{"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "[email protected]", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}
Each line is a dictionary and some dictionaries have more keys than others, and I would like to convert the text file to a csv file.
I have the following code:
import json
import csv
import ast
def json_to_csv(txt_file, csv_file):
lista = []
with open(txt_file, 'rb') as fin:
lines = fin.readlines()
for line in lines:
dict_line = ast.literal_eval(line)
lista.append(line)
list_json = json.dumps(lista)
read_json = json.loads(list_json)
header =["feed.accuracy","feed.url","source.geolocation.longitude","event_description.text","raw","destination.geolocation.city","source.ip","classification.taxonomy",
"time.observation","destination.geolocation.latitude","destination.ip","source.asn","feed.name","source.geolocation.latitude","time.source","feed.provider",
"destination.geolocation.longitude","destination.geolocation.cc","destination.asn","source.abuse_contact","source.geolocation.cc","classification.type"]
with open(csv_file, 'wb+') as f:
dict_writer = csv.DictWriter(f, header)
dict_writer.writeheader()
dict_writer.writerows(read_json)
First I read the text file, then I convert its content into JSON and then I try to write the converted data into the csv file, however its returning the following error:
Traceback (most recent call last):
File "<pyshell#38>", line 1, in <module>
json_to_csv('ctt.txt','ctt.csv')
File "C:/Users/Marisa/Documents/json_to_csv.py", line 26, in json_to_csv
dict_writer.writerows(read_json)
File "C:\Python27\lib\csv.py", line 157, in writerows
rows.append(self._dict_to_list(rowdict))
File "C:\Python27\lib\csv.py", line 148, in _dict_to_list
+ ", ".join([repr(x) for x in wrong_fields]))
ValueError: dict contains fields not in fieldnames: u'{', u'"', u'f', u'e', u'e', u'd', u'.', u'a', u'c', u'c', u'u', u'r', u'a', u'c', u'y', u'"', u':', u' ', u'1', u'0', u'0', u'.', u'0', u',', u' ', u'"', u'c', u'l', u'a', u's', u's', u'i', u'f', u'i', u'c', u'a', u't', u'i', u'o', u'n', u'.', u't', u'a', u'x',...
Upvotes: 1
Views: 750
Reputation: 861
You're making it a little more complicated than it needs to be, and you're missing some of the fields in your own example data above. We can get rid of the ast
dependency and the back & forth JSON processing, add in the missing fields, and the following will work with the sample data you've provided:
import json
import csv
def json_to_csv(txt_file, csv_file):
lista = []
with open(txt_file, 'r') as in_file:
lines = in_file.readlines()
for line in lines:
try:
dict_line = json.loads(line)
lista.append(dict_line)
except Exception as err:
print(err)
header = [
"feed.accuracy", "feed.url", "source.geolocation.longitude",
"event_description.text", "raw", "destination.geolocation.city",
"source.ip", "classification.taxonomy", "time.observation",
"destination.geolocation.latitude", "destination.ip", "source.asn",
"feed.name", "source.geolocation.latitude", "time.source",
"feed.provider", "destination.geolocation.longitude",
"destination.geolocation.cc", "destination.asn",
"source.abuse_contact", "source.geolocation.cc", "classification.type",
'destination.fqdn', 'source.fqdn', 'source.geolocation.city',
'source.url', 'destination.url'
]
with open(csv_file, 'w+') as out_file:
dict_writer = csv.DictWriter(out_file, header)
dict_writer.writeheader()
dict_writer.writerows(lista)
Note that if your real data has more fields that aren't included in your sample, you'll need to add those, too.
Note too that if your input data were a proper JSON array like:
[{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "[email protected]", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"},
{"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "[email protected]", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}]
the solution simplifies quite a bit more with the whole initial with open
block becoming just:
with open(txt_file, 'r') as in_file:
lista = json.load(in_file)
Upvotes: 2