Reputation: 11
I have been having issues where my data is in the wrong fields. I have a few large csv files that I have to manually update before loading into QLIK. The csv's have a comma (,) delimiter & double quote (") text qualifier. Some data has extra characters that throw it off and results in numeric numbers in text fields and vice versa. Can someone please advise the best/fastest way to combat this? To remove the unwanted " and save me from manually deleting quotes and pasting to correct fields for hundreds of records. I have created dummy data below.
Please note I am bit limited with the tools I have available to clean the csv. Or could you please advise the best tools/applications needed for this? Just unsure where to start
IN NOTEPAD:
ID,T_No,T_Type,T_Date,T_Name,T_TNo,
2,256,House,30/05/2021,Airport,75.1,
3,268,Hotel,31/05/2021,Hotel Antel""",76.1
4,269,House,31/05/2021,Bank of USA,"LA Branch""""",77.1
IN EXCEL: [enter image description here][1]
Any assistance is greatly appreciated.
Thank you [1]: https://i.sstatic.net/vyYAT.png
Upvotes: 1
Views: 1694
Reputation: 2631
If you have access to C# (there is a free version) you could process the file and fix the bad records. I would do that by figuring out first if there is an issue, and if there is then. Figure out where the name field starts and ends and fix the issues with the quotes.
This would be a good starting point:
private void UpdateCsv()
{
var lines = System.IO.File.ReadAllLines("your file");
var updatedLines = new List<string>();
foreach (var line in lines)
{
//fixes issue with your first example
var newLine = line.TrimEnd(',');
var fixedString = "";
if (newLine.Split(",").Length == 6) //indicates there are no issues
{
fixedString = newLine;
}
else
{
//get the start of the name field
var startName = IndexOfOccurence(newLine, ",", 4) + 1;
//get the end of the name field
var endName = newLine.LastIndexOf(',') + 1;
//populate a new string to hold the fixed data
fixedString = newLine.Substring(0, startName);
//parse the name field based on start and end
var name = newLine.Substring(startName, endName - startName - 1);
//get rid of starting and ending quotes
name = name.TrimStart('"').TrimEnd('"');
//what to do with quote in middle of string? escape or remove your choice uncomment your choice
//name = name.Replace('"', ' '); //to remove
//name = name.Replace("\"", "\"\""); //to escape
//if name contains comma or quote then add quote, else not needed
if (name.Contains(',') || name.Contains('"'))
{
fixedString += "\"" + name + "\"" + newLine.Substring(endName - 1);
}
else
{
fixedString += name + newLine.Substring(endName - 1);
}
}
updatedLines.Add(fixedString);
}
//write out the updated data
System.IO.File.WriteAllLines("your file", updatedLines);
}
private int IndexOfOccurence(string s, string match, int occurence)
{
int i = 1;
int index = 0;
while (i <= occurence && (index = s.IndexOf(match, index + 1)) != -1)
{
if (i == occurence)
return index;
i++;
}
return -1;
}
Upvotes: 0
Reputation: 9074
If the issue is just with the T_Name
column, you could set the mode to CsvMode.NoEscape
, use the ClassMap
to get the fields you know you can get without issue and then use some logic to figure out where the T_Name
column ends and the T_TNo
column starts. There is a lot that could break in this code, depending on what the rest of the data looks like, but it should at least give you some ideas.
void Main()
{
var text = new StringBuilder();
text.AppendLine("ID,T_No,T_Type,T_Date,T_Name,T_TNo,");
text.AppendLine("2,256,House,30/05/2021,Airport,75.1,");
text.AppendLine("3,268,Hotel,31/05/2021,Hotel Antel\"\"\",76.1");
text.AppendLine("4,269,House,31/05/2021,Bank of USA,\"LA Branch\"\"\"\"\",77.1");
var config = new CsvConfiguration(CultureInfo.InvariantCulture)
{
Mode = CsvMode.NoEscape
};
using (var reader = new StringReader(text.ToString()))
using (var csv = new CsvReader(reader, config))
{
var options = new TypeConverterOptions { Formats = new[] { "dd/MM/yyyy" } };
csv.Context.TypeConverterOptionsCache.AddOptions<DateTime>(options);
csv.Context.RegisterClassMap<MyClassMap>();
var records = new List<MyClass>();
csv.Read();
csv.ReadHeader();
while (csv.Read())
{
var record = csv.GetRecord<MyClass>();
var name = string.Empty;
int i = 4;
var finished = false;
while (!finished)
{
var field = csv.GetField(i);
if (i == 4)
{
record.Name = field.Replace("\"", "");
i++;
continue;
}
var isNumber = float.TryParse(field, out var number);
if (!isNumber)
{
record.Name += ", " + field.Replace("\"", "");
i++;
continue;
}
record.TNumber = number;
finished = true;
}
records.Add(record);
}
records.Dump();
}
}
public class MyClassMap : ClassMap<MyClass>
{
public MyClassMap()
{
Map(x => x.Id).Name("ID");
Map(x => x.Number).Name("T_No");
Map(x => x.Type).Name("T_Type");
Map(x => x.Date).Name("T_Date");
}
}
public class MyClass
{
public int Id { get; set; }
public int Number { get; set; }
public string Type { get; set; }
public DateTime Date { get; set; }
public string Name { get; set; }
public float TNumber { get; set; }
}
Upvotes: 0