scraper34863
scraper34863

Reputation: 1

How to convert JSON with nested elements to csv in Python?

I try to convert a huge file (~65 GB) into smaller subsets. The goal is to split the files into the smaller subsets e.g. 1 mil tweets per file and convert this data to a csv format. I currently have a working splitting code that splits the ndjson file to smaller ndjson files, but I have trouble to convert the data to csv. The important part is to create columns for each exsisting variable, so columns named __crawled_url or w1_balanced. There are quite a few nested variabels in the data, like w1_balanced is contained in the variable theme_topic, that need to be flattened.

Here is the working split that I want to merge with the csv conversion:

import json
#function to split big ndjson file to multiple smaller files
def split_file(input_file, lines_per_file): #variables that the function calls
    file_count = 0
    line_count = 0
    output_lines = []
    with open(input_file, 'r', encoding="utf8") as infile:
        for line in infile:
            output_lines.append(line)
            line_count += 1
            if line_count == lines_per_file:
                with open(f'1mio_split_{file_count}.ndjson', 'w', encoding="utf8") as outfile:
                    outfile.writelines(output_lines)
                file_count += 1
                line_count = 0
                output_lines = []
        #handle any remaining lines
        if output_lines:
            with open(f'1mio_split_{file_count}.ndjson', 'w',encoding="utf8") as outfile:
                outfile.writelines(output_lines)
#file containing tweets
input_file = input("path to big file:" )
#example filepath: C:/Users/YourName/Documents/tweet.ndjson
#how many lines/tweets should the new file contain?
lines_per_file = int(input ("Split after how many lines?: "))
split_file(input_file, lines_per_file)
print("Splitting done!")

Here are 2 sample lines from the data I use:

[{"__crawled_url":"https://twitter.com/example1","theme_topic":{"w1_balanced":{"label":"__label__a","confidence":0.3981},"w5_balanced":{"label":"__label__c","confidence":1}},"author":"author1","author_userid":"116718988","author_username":"author1","canonical_url":"https://twitter.com/example1","collected_by":"User","collection_method":"tweety 1.0.9.4","collection_time":"2024-05-27T14:40:32","collection_time_epoch":1716813632,"isquoted":false,"isreply":true,"isretweet":false,"language":"de","mentioning/replying":"twitteruser","num_likes":"0","num_retweets":"0","plain_text":"@twitteruser here is an exmaple text 🤔","published_time":"2024-04-18T20:14:51","published_time_epoch":1713471291,"published_time_original":"2024-04-18 20:14:51+00:00","replied_tweet":{"author":"Twitter User","author_userid":"1053198649700827136","author_username":"twitteruser"},"spacy_annotations":{"de_core_news_lg":{"noun_chunks":[{"text":"@twitteruser","start_char":0,"end_char":9},{"text":"more exapmle text","start_char":20,"end_char":34},{"text":"Gel","start_char":40,"end_char":43},{"text":"Haar","start_char":47,"end_char":51}],"named_entities":[{"text":"@twitteruser","start_char":0,"end_char":9,"label_":"MISC"}]},"xx_ent_wiki_sm":{"named_entities":{}},"da_core_news_lg":{"noun_chunks":{},"named_entities":{}},"en_core_web_lg":{"noun_chunks":{},"named_entities":{}},"fr_core_news_lg":{"noun_chunks":{},"named_entities":{}},"it_core_news_lg":{"noun_chunks":{},"named_entities":{}},"pl_core_news_lg":{"named_entities":{}},"es_core_news_lg":{"noun_chunks":{},"named_entities":{}},"fi_core_news_lg":{"noun_chunks":{},"named_entities":{}}},"tweet_id":"1781053802398814682","hashtags":{},"outlinks":{},"quoted_tweet":{"outlinks":{},"hashtags":{},"mentioning/replying":{},"replied_tweet":{}}}]

[{"__crawled_url":"https://twitter.com/example2","theme_topic":{"w1_balanced":{"label":"__label__a","confidence":0.3981},"w5_balanced":{"label":"__label__c","confidence":1}},"author":"author2","author_userid":"116712288","author_username":"author2","canonical_url":"https://twitter.com/example2","collected_by":"User","collection_method":"tweety 1.0.9.4","collection_time":"2024-05-27T14:40:32","collection_time_epoch":1716813632,"isquoted":false,"isreply":true,"isretweet":false,"language":"de","mentioning/replying":"twitteruser","num_likes":"0","num_retweets":"0","plain_text":"@twitteruser here is another exmaple text 🤔","published_time":"2024-04-18T20:14:51","published_time_epoch":1713471291,"published_time_original":"2024-04-18 20:14:51+00:00","replied_tweet":{"author":"Twitter User","author_userid":"1053198649700827136","author_username":"twitteruser"},"spacy_annotations":{"de_core_news_lg":{"noun_chunks":[{"text":"@twitteruser","start_char":0,"end_char":9},{"text":"more exapmle text","start_char":20,"end_char":34},{"text":"Gel","start_char":40,"end_char":43},{"text":"Haar","start_char":47,"end_char":51}],"named_entities":[{"text":"@twitteruser","start_char":0,"end_char":9,"label_":"MISC"}]},"xx_ent_wiki_sm":{"named_entities":{}},"da_core_news_lg":{"noun_chunks":{},"named_entities":{}},"en_core_web_lg":{"noun_chunks":{},"named_entities":{}},"fr_core_news_lg":{"noun_chunks":{},"named_entities":{}},"it_core_news_lg":{"noun_chunks":{},"named_entities":{}},"pl_core_news_lg":{"named_entities":{}},"es_core_news_lg":{"noun_chunks":{},"named_entities":{}},"fi_core_news_lg":{"noun_chunks":{},"named_entities":{}}},"tweet_id":"1781053802398814682","hashtags":{},"outlinks":{},"quoted_tweet":{"outlinks":{},"hashtags":{},"mentioning/replying":{},"replied_tweet":{}}}]

Try 1

import json
import csv

data = "C:/Users/Sample-tweets.ndjson"
json_data = json.loads(data)
csv_file ="try3.csv"
csv_obj = open(csv_file, "w")
csv_writer = csv.writer(csv_obj)
header = json_data[0].keys()
csv_writer.writerow(header)
for item in json_data:
    csv_writer.writerow(item.values())
csv_obj.close()
#raise JSONDecodeError("Expecting value", s, err.value) from None
#json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Try 2

import json
import csv

with open('Sample-tweets.ndjson', encoding="utf8") as ndfile:
data = json.load(ndfile)

csv_data = data['emp_details']
data_file = open('try1.csv', 'w', encoding="utf8")
csv_writer = csv.writer(data_file)
count = 0
for data in csv_data:
    if count == 0:
        header = emp.keys()
csv_writer.writerow(header) #spacing error?! can't even run the script 
        count += 1
    csv_writer.writerow(emp.values())
data_file.close()

with open('Sample-tweets.ndjson', encoding="utf8") as ndfile:
jsondata = json.load(ndfile)

data_file = open('try2.csv', 'w', newline='', encoding="uft8")
csv_writer = csv.writer(data_file)

count = 0
for data in ndfile:
if count == 0:
header = data.keys()
csv_writer.writerow(header)
count += 1
csv_writer.writerow(data.values())
data_file.close()
#error message: raise JSONDecodeError("Extra data", s, end)
#json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 1908)

Try 3 to see if the dictionary works

import json

output_lines=[]
with open('C:/Users/Sample1-tweets.ndjson', 'r', encoding="utf8") as f:
    json_in=f.read()
json_in=json.loads(json_in)
print(json_in[2])
#error message: raise JSONDecodeError("Extra data", s, end)
#json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 1908)
#->same error message as above

Thanks a lot for your help!

Upvotes: 0

Views: 63

Answers (0)

Related Questions