Reputation: 83
Using the below code snippet on the .csv dataset threw a huge error as shown. I have tried with many suggestions but no success either.
import numpy as np
import pandas as pd
import pickle
#from prettytable import PrettyTable
import csv
my_udn_f = "20200914181913_18511.csv"
df = pd.read_csv(my_udn_f,sep=',', engine="python")
Result:
[---------------------------------------------------------------------------
ParserError Traceback (most recent call last)
<ipython-input-108-924767287934> in <module>
7 my_udn_f = "20200914181913_18511.csv"
8
----> 9 df = pd.read_csv(my_udn_f,sep=',', engine="python")
~/anaconda3/envs/kaligs_analytic1/lib/python3.7/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
674 )
675
--> 676 return _read(filepath_or_buffer, kwds)
677
678 parser_f.__name__ = name
~/anaconda3/envs/kaligs_analytic1/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
452
453 try:
--> 454 data = parser.read(nrows)
455 finally:
456 parser.close()
~/anaconda3/envs/kaligs_analytic1/lib/python3.7/site-packages/pandas/io/parsers.py in read(self, nrows)
1131 def read(self, nrows=None):
1132 nrows = _validate_integer("nrows", nrows)
-> 1133 ret = self._engine.read(nrows)
1134
1135 # May alter columns / col_dict
~/anaconda3/envs/kaligs_analytic1/lib/python3.7/site-packages/pandas/io/parsers.py in read(self, rows)
2456 content = content\[1:\]
2457
-> 2458 alldata = self._rows_to_cols(content)
2459 data = self._exclude_implicit_index(alldata)
2460
~/anaconda3/envs/kaligs_analytic1/lib/python3.7/site-packages/pandas/io/parsers.py in _rows_to_cols(self, content)
3111 msg += ". " + reason
3112
-> 3113 self._alert_malformed(msg, row_num + 1)
3114
3115 # see gh-13320
~/anaconda3/envs/kaligs_analytic1/lib/python3.7/site-packages/pandas/io/parsers.py in _alert_malformed(self, msg, row_num)
2870
2871 if self.error_bad_lines:
-> 2872 raise ParserError(msg)
2873 elif self.warn_bad_lines:
2874 base = f"Skipping line {row_num}: "
ParserError: Expected 39 fields inline 301474, saw 41][1]
However, when I used the below code snippet, it runs but excludes 2 rows which I wouldn't want them excluded. Looking for more techniques to help tackle this problem at hand.
import numpy as np
import pandas as pd
import pickle
#from prettytable import PrettyTable
import csv
my_udn_f = "20200914181913_18511.csv"
df = pd.read_csv(my_udn_f,sep=',', engine="python", error_bad_lines=False)
Skipping line 301474: Expected 39 fields in line 301474, saw 41
Skipping line 313425: Expected 39 fields in line 313425, saw 41er code here
Upvotes: 1
Views: 1197
Reputation: 62403
'Request_Content_Type'
, are not properly formatted."..."
.# line 301474
1600096774,546,1600096776,220,0,641101909494671,256779102447,357909083208490,10.152.148.242,27.86.13.208,38577,80,6,internet,1024,1,212.88.127.3,212.88.127.153,"",64110177B51C3,64110177B06FF,"","","",685,431,6,5,4,249,359,78,"au-cs0.kddi.com/support/myau/fixedvalue/fixedvalue_version.xml","Mozilla/5.0 (Linux; U; Android 7.1.1; ja-jp; KYV42) auCSApp/6.2.0",302,0,"",0,application/x-www-form-urlencoded,text/html,application/xhtml+xm
# add "" around the ending bit
,"application/x-www-form-urlencoded,text/html,application/xhtml+xm"
# line 313425
1600096776,206,1600096777,461,0,641101909494671,256779102447,357909083208490,10.152.148.242,111.87.221.97,40494,80,6,internet,1024,1,212.88.127.3,212.88.127.153,"",64110177B51C3,64110177B06FF,"","","",698,752,6,5,4,249,359,78,"my.au.com/rd/ac0/support/myau/fixedvalue/fixedvalue_version.xml","Mozilla/5.0 (Linux; U; Android 7.1.1; ja-jp; KYV42) auCSApp/6.2.0",302,0,"",0,application/x-www-form-urlencoded,text/html,application/xhtml+xm
# add "" around the ending bit
,"application/x-www-form-urlencoded,text/html,application/xhtml+xm"
# read the file
df = pd.read_csv('20200914181913_18511.csv', encoding="ISO-8859-1")
# display(df.head())
StartTimeSecond StartTimeMilSec EndTimeSecond EndTimeMilSec AssociationFlag IMSI MSISDN IMEISV MS_IP Server_IP MS_Port Server_Port IP_Protocol APN Charging_Characteristics RAT_Type Serving_Node_IP Gateway_Node_IP CGI SAI RAI TAI ECGI LAI Uplink_Traffic Downlink_Traffic Uplink_Packets Downlink_Packets Protocol_Category Application Sub_Application EGN_Sub_Protocol URL User_Agent Status_Code Response_Content_Length Response_Content_Type Request_Content_Length Request_Content_Type
0 1600096747 942 1600096749 972 0 641101003258952 256774286149 3.52854e+14 10.135.43.67 47.254.56.66 33245 6443 6 internet 1024 1 212.88.127.5 212.88.127.157 NaN 6411017785804 64110177804FF NaN NaN NaN 172 60 3 1 10 580 787 513 NaN NaN NaN NaN NaN NaN NaN
1 1600096749 923 1600096749 943 0 641102948752052 256774832602 3.55953e+14 10.135.224.249 41.210.187.253 64915 53 17 internet 1024 1 212.88.127.5 212.88.127.157 NaN 64110177A45C2 64110177A0AFF NaN NaN NaN 69 155 1 1 11 590 797 79 NaN NaN NaN NaN NaN NaN NaN
2 1600096750 181 1600096750 246 0 641101903995002 256787298917 3.57171e+14 10.130.197.55 10.120.0.138 47014 8080 6 yellopix.mtn.co.ug 512 6 212.88.127.48 212.88.127.157 NaN NaN NaN 641100023 641100006C904 NaN 152 425 3 2 4 806 1109 2016 instagram.febb2-1.fna.fbcdn.net:443 NaN 600.0 0.0 NaN 0.0 NaN
3 1600096750 121 1600096750 185 0 641101903995002 256787298917 3.57171e+14 10.130.197.55 10.120.0.138 47004 8080 6 yellopix.mtn.co.ug 512 6 212.88.127.48 212.88.127.157 NaN NaN NaN 641100023 641100006C904 NaN 152 425 3 2 4 806 1109 2016 i.instagram.com:443 NaN 600.0 0.0 NaN 0.0 NaN
4 1600096743 376 1600096750 134 0 641101914219965 256778856060 3.56676e+14 10.160.38.132 34.194.71.217 56750 80 6 internet 1024 1 212.88.127.5 212.88.127.157 NaN 64110177B3E9B 64110177B08FF NaN NaN NaN 156 100 3 2 3 238 340 1404 c.whatsapp.net/chat Mozilla/5.0 (compatible; WAChat/1.2; +http://www.whatsapp.com/contact) NaN 0.0 NaN 0.0 NaN
df.iloc[301472:301474, :]
StartTimeSecond StartTimeMilSec EndTimeSecond EndTimeMilSec AssociationFlag IMSI MSISDN IMEISV MS_IP Server_IP MS_Port Server_Port IP_Protocol APN Charging_Characteristics RAT_Type Serving_Node_IP Gateway_Node_IP CGI SAI RAI TAI ECGI LAI Uplink_Traffic Downlink_Traffic Uplink_Packets Downlink_Packets Protocol_Category Application Sub_Application EGN_Sub_Protocol URL User_Agent Status_Code Response_Content_Length Response_Content_Type Request_Content_Length Request_Content_Type
301472 1600096774 546 1600096776 220 0 641101909494671 256779102447 3.57909e+14 10.152.148.242 27.86.13.208 38577 80 6 internet 1024 1 212.88.127.3 212.88.127.153 NaN 64110177B51C3 64110177B06FF NaN NaN NaN 685 431 6 5 4 249 359 78 au-cs0.kddi.com/support/myau/fixedvalue/fixedvalue_version.xml Mozilla/5.0 (Linux; U; Android 7.1.1; ja-jp; KYV42) auCSApp/6.2.0 302.0 0.0 NaN 0.0 application/x-www-form-urlencoded,text/html,application/xhtml+xm
301473 1600096775 920 1600096775 923 0 641101006171954 256771861713 3.52289e+14 10.129.61.253 82.145.209.241 36193 1080 6 internet 1024 1 212.88.127.3 212.88.127.153 NaN 64110177ACBFA 64110177A0AFF NaN NaN NaN 0 40 0 1 16 669 922 2 NaN NaN NaN NaN NaN NaN NaN
df.iloc[313423:313425, :]
StartTimeSecond StartTimeMilSec EndTimeSecond EndTimeMilSec AssociationFlag IMSI MSISDN IMEISV MS_IP Server_IP MS_Port Server_Port IP_Protocol APN Charging_Characteristics RAT_Type Serving_Node_IP Gateway_Node_IP CGI SAI RAI TAI ECGI LAI Uplink_Traffic Downlink_Traffic Uplink_Packets Downlink_Packets Protocol_Category Application Sub_Application EGN_Sub_Protocol URL User_Agent Status_Code Response_Content_Length Response_Content_Type Request_Content_Length Request_Content_Type
313423 1600096776 206 1600096777 461 0 641101909494671 256779102447 3.57909e+14 10.152.148.242 111.87.221.97 40494 80 6 internet 1024 1 212.88.127.3 212.88.127.153 NaN 64110177B51C3 64110177B06FF NaN NaN NaN 698 752 6 5 4 249 359 78 my.au.com/rd/ac0/support/myau/fixedvalue/fixedvalue_version.xml Mozilla/5.0 (Linux; U; Android 7.1.1; ja-jp; KYV42) auCSApp/6.2.0 302.0 0.00000e+00 NaN 0.00000e+00 application/x-www-form-urlencoded,text/html,application/xhtml+xm
313424 1600096746 725 1600096777 916 0 641101907765025 256775460010 3.54450e+14 10.129.26.81 51.15.204.181 53258 443 6 internet 1024 6 212.88.127.3 212.88.127.153 NaN NaN NaN 641100032 6411000076703 NaN 134 116 2 2 4 250 360 430 NaN NaN NaN 4.29497e+09 NaN 4.29497e+09 NaN
'Request_Content_Type'
row.row
row
and saved into the correct index, i
, in data
data
is then saved to a '*_fixed.csv'
file.import csv
with open('20200914181913_18511.csv', encoding="ISO-8859-1") as f, open('20200914181913_18511_fixed.csv', 'w', newline='') as fixed:
# load the rows into an object that can be updated and saved
data = list(csv.reader(f, delimiter=','))
# fix rows that are to long
# assumes this will always be the same issue with the end of the row
for i, row in enumerate(data):
if i == 0:
header_len = len(row)
if len(row) > header_len:
data[i] = row[:header_len-1] + [', '.join(row[-(len(row)+1-header_len):])]
writer = csv.writer(fixed, delimiter=',')
writer.writerows(data)
Upvotes: 1