Convert text file of dictionaries to csv Python

Question

I have a text file composed by different dictionaries and it looks like this:

{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "coisas@foo.com", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"}
{"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "coisas7@foo.com", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}

Each line is a dictionary and some dictionaries have more keys than others, and I would like to convert the text file to a csv file.

I have the following code:

import json
import csv
import ast

def json_to_csv(txt_file, csv_file):

    lista = []
    with open(txt_file, 'rb') as fin:
        lines = fin.readlines()
        for line in lines:
            dict_line = ast.literal_eval(line)
            lista.append(line)
    list_json = json.dumps(lista)

    read_json = json.loads(list_json)

    header =["feed.accuracy","feed.url","source.geolocation.longitude","event_description.text","raw","destination.geolocation.city","source.ip","classification.taxonomy",
                "time.observation","destination.geolocation.latitude","destination.ip","source.asn","feed.name","source.geolocation.latitude","time.source","feed.provider",
                "destination.geolocation.longitude","destination.geolocation.cc","destination.asn","source.abuse_contact","source.geolocation.cc","classification.type"]
    with open(csv_file, 'wb+') as f:
        dict_writer = csv.DictWriter(f, header)
        dict_writer.writeheader()
        dict_writer.writerows(read_json)

First I read the text file, then I convert its content into JSON and then I try to write the converted data into the csv file, however its returning the following error:

Traceback (most recent call last):
  File "", line 1, in 
    json_to_csv('ctt.txt','ctt.csv')
  File "C:/Users/Marisa/Documents/json_to_csv.py", line 26, in json_to_csv
    dict_writer.writerows(read_json)
  File "C:\Python27\lib\csv.py", line 157, in writerows
    rows.append(self._dict_to_list(rowdict))
  File "C:\Python27\lib\csv.py", line 148, in _dict_to_list
    + ", ".join([repr(x) for x in wrong_fields]))
ValueError: dict contains fields not in fieldnames: u'{', u'"', u'f', u'e', u'e', u'd', u'.', u'a', u'c', u'c', u'u', u'r', u'a', u'c', u'y', u'"', u':', u' ', u'1', u'0', u'0', u'.', u'0', u',', u' ', u'"', u'c', u'l', u'a', u's', u's', u'i', u'f', u'i', u'c', u'a', u't', u'i', u'o', u'n', u'.', u't', u'a', u'x',...

Feneric · Accepted Answer

You're making it a little more complicated than it needs to be, and you're missing some of the fields in your own example data above. We can get rid of the ast dependency and the back & forth JSON processing, add in the missing fields, and the following will work with the sample data you've provided:

import json
import csv

def json_to_csv(txt_file, csv_file):

    lista = []
    with open(txt_file, 'r') as in_file:
        lines = in_file.readlines()
        for line in lines:
            try:
                dict_line = json.loads(line)
                lista.append(dict_line)
            except Exception as err:
                print(err)

    header = [
        "feed.accuracy", "feed.url", "source.geolocation.longitude",
        "event_description.text", "raw", "destination.geolocation.city",
        "source.ip", "classification.taxonomy", "time.observation",
        "destination.geolocation.latitude", "destination.ip", "source.asn",
        "feed.name", "source.geolocation.latitude", "time.source",
        "feed.provider", "destination.geolocation.longitude",
        "destination.geolocation.cc", "destination.asn",
        "source.abuse_contact", "source.geolocation.cc", "classification.type",
        'destination.fqdn', 'source.fqdn', 'source.geolocation.city',
        'source.url', 'destination.url'
    ]
    with open(csv_file, 'w+') as out_file:
        dict_writer = csv.DictWriter(out_file, header)
        dict_writer.writeheader()
        dict_writer.writerows(lista)

Note that if your real data has more fields that aren't included in your sample, you'll need to add those, too.

Note too that if your input data were a proper JSON array like:

[{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "coisas@foo.com", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"},
{"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "coisas7@foo.com", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}]

the solution simplifies quite a bit more with the whole initial with open block becoming just:

with open(txt_file, 'r') as in_file:
    lista = json.load(in_file)

Convert text file of dictionaries to csv Python

Answers (1)

Related Questions