Reputation: 195
I need to insert 1.9 million new records into a MySQL database. To do this I'm using C# with Entity Framework, but the process seems incredibly slow. At the current rate, it would take several days to process these records.
What am I doing wrong and how do I speed this up?
In the database I have 2 tables: Hashes and Categories. Each hash should be unique and can have multiple categories, with only 1 category being active per hash.
The process that I need to follow is to first check if the hash exists. If it does, then I need to find its current active category, deactivate it and add the new one.
The problem is that my try { } block takes about 150ms and the block that does SaveChanges() takes about 15-30 seconds, so processing 1.9M records this way will take several days.
using (var reader = new StreamReader(File.OpenRead(filepath)))
using (MySQLContext db = new MySQLContext(options))
{
    // Disable automatic change detection while bulk processing
    db.ChangeTracker.AutoDetectChangesEnabled = false;

    int loopCounter = 0;
    string line;

    // Load the db tables into memory
    var hashes = db.Hashes.ToList();
    var categories = db.Categories.ToList();

    while ((line = reader.ReadLine()) != null)
    {
        var matches = Regex.Matches(line, "(?<MD5>[a-zA-Z0-9]+)(?<Category>[0-9])");

        InputHashModel inputHash = new InputHashModel()
        {
            MD5 = matches[0].Groups["MD5"].Value,
            Category = matches[0].Groups["Category"].Value
        };

        try
        {
            // Check if the hash already exists
            Hash hash = hashes.Where(h => h.MD5 == inputHash.MD5).FirstOrDefault();

            // If the hash doesn't exist - add it
            if (hash == null)
                hash = new Hash(inputHash.MD5);
            else
            {
                // Check if an active category already exists for this hash
                Category category = categories.Where(c => c.Active == true && c.HashId == hash.Id).FirstOrDefault();

                // If it exists - deactivate it
                if (category != null)
                {
                    // If the same category already exists - proceed to the next hash
                    if (category.Source == "ThisInput" && category.Category == inputHash.Category)
                    {
                        loopCounter++;
                        continue;
                    }

                    category.Active = false;
                    category.DeactivatedTimestamp = DateTime.Now;
                }
            }

            // Add the new category
            Category new_category = new Category() { Hash = hash, Source = "ThisInput", Category = inputHash.Category, Active = true };
            db.Categories.Add(new_category);

            // Save changes every 1000 records
            if (loopCounter % 1000 == 0)
            {
                db.ChangeTracker.DetectChanges();
                db.SaveChanges();
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("Exception: " + e);
        }

        loopCounter++;
    }

    db.ChangeTracker.AutoDetectChangesEnabled = true;
    db.SaveChanges();
    Console.WriteLine("Finished");
}
Upvotes: 3
Views: 4598
Reputation: 195
I managed to achieve this through the use of the BulkInsert() and BulkUpdate() functions from EntityFramework Extensions.
I went about it a slightly different way though: I compare the input hashes against those already in the database using .Intersect(myCustomComparer), and do that twice to separate unique and duplicate entries into two lists. Here's the code for the functions:
public class HashMD5Comparer : IEqualityComparer<Hash>
{
    // Hashes are equal if their MD5 strings are equal.
    public bool Equals(Hash x, Hash y)
    {
        // Check whether the compared objects reference the same data.
        if (Object.ReferenceEquals(x, y)) return true;

        // Check whether either of the compared objects is null.
        if (Object.ReferenceEquals(x, null) || Object.ReferenceEquals(y, null))
            return false;

        // Check whether the hashes' MD5 properties are equal.
        return x.MD5 == y.MD5;
    }

    // If Equals() returns true for a pair of objects
    // then GetHashCode() must return the same value for these objects.
    public int GetHashCode(Hash hash)
    {
        // Check whether the object is null.
        if (Object.ReferenceEquals(hash, null)) return 0;

        // Get the hash code for the MD5 field if it is not null.
        int hashMD5 = hash.MD5 == null ? 0 : hash.MD5.GetHashCode();

        // Calculate the hash code for the hash.
        return hashMD5;
    }
}
public class HashComparer
{
    private static Hash[] uniqueHashes;
    private static Hash[] duplicateHashes;
    private static Hash[] duplicateHashesInDb;

    private static void SortHashes(Hash[] hashes)
    {
        Hash[] hashesInDatabase;

        // Download hashes from the database
        using (MySQLContext db = new MySQLContext())
        {
            hashesInDatabase = db.Hashes.Where(h => h.MD5 != null).ToArray();
        }

        // Find duplicates against the database
        duplicateHashes = hashes.Intersect(hashesInDatabase, new HashMD5Comparer()).ToArray();
        duplicateHashesInDb = hashesInDatabase.Intersect(hashes, new HashMD5Comparer()).ToArray();

        // Find uniques in the input
        uniqueHashes = hashes.Except(duplicateHashes, new HashMD5Comparer()).ToArray();
    }

    private static void ActionDuplicateHashes()
    {
        Assessment[] assessmentsInDatabase;
        List<Assessment> assessmentsToDeactivate = new List<Assessment>();
        List<Assessment> assessmentsToAdd = new List<Assessment>();

        // Download assessments from the database
        using (MySQLContext db = new MySQLContext(GenerateMySQLOptions()))
        {
            var duplicateHashIds = duplicateHashesInDb.Select(h => h.Id).ToArray();
            assessmentsInDatabase = db.Assessments.Where(a => duplicateHashIds.Contains(a.HashId)).ToArray();
        }

        foreach (var inputHash in duplicateHashes)
        {
            // Look up the hash in the database to get the ID
            var liveHashId = Array.Find(duplicateHashesInDb, h => h.MD5 == inputHash.MD5).Id;

            // Find the assessment in the database to compare (and deactivate if needed)
            var liveAssessment = Array.Find(assessmentsInDatabase, a => a.HashId == liveHashId);

            // Get the new assessment of the hash
            var newAssessment = inputHash.Assessments.FirstOrDefault();
            if (newAssessment == null)
            {
                Console.WriteLine($"Failed lookup for new assessment {inputHash.MD5}");
                return;
            }

            // Set the hashId (relationship) for the new assessment
            newAssessment.HashId = liveHashId;

            if (liveAssessment != null)
            {
                if (liveAssessment.Origin == newAssessment.Origin &&
                    liveAssessment.Category == newAssessment.Category)
                {
                    // Exact duplicate - leave as is
                }
                else
                {
                    // Deactivate the current assessment in the database
                    liveAssessment.Active = false;

                    // Add the assessment to a list to deactivate (update)
                    assessmentsToDeactivate.Add(liveAssessment);

                    // Add the new assessment that will be added once the old one gets deactivated
                    assessmentsToAdd.Add(newAssessment);
                }
            }
            else
            {
                // No assessment for the hash in the database - just add a new one
                assessmentsToAdd.Add(newAssessment);
            }
        }

        // Bulk update the assessments in the database that are to be deactivated
        using (MySQLContext db = new MySQLContext(GenerateMySQLOptions()))
        {
            db.Assessments.BulkUpdate(assessmentsToDeactivate);
        }

        // Bulk insert the new assessments
        using (MySQLContext db = new MySQLContext(GenerateMySQLOptions()))
        {
            db.Assessments.BulkInsert(assessmentsToAdd);
        }
    }

    private static void ActionUniqueHashes()
    {
        // Bulk insert all unique hashes and their assessments
        using (MySQLContext db = new MySQLContext())
        {
            // options.IncludeGraph adds any related entities to the database as well
            db.Hashes.BulkInsert(uniqueHashes, options => options.IncludeGraph = true);
        }
    }
}
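For reference, here is how the pieces fit together. This is only a sketch: the original keeps SortHashes(), ActionUniqueHashes() and ActionDuplicateHashes() private, so a hypothetical public Run() entry point on HashComparer is assumed here, along with the input file already being parsed into Hash objects.
// Hypothetical entry point tying the steps together (not in the original class)
public static void Run(Hash[] inputHashes)
{
    // Split the input into unique and duplicate hashes against the database
    SortHashes(inputHashes);

    // Bulk insert brand-new hashes together with their assessments
    ActionUniqueHashes();

    // Deactivate superseded assessments and bulk insert the replacements
    ActionDuplicateHashes();
}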
There is more optimisation to be done because this uses A LOT of RAM, especially when doing the unique hashes bulk insert (not sure why). But all in all, it works.
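If memory becomes the limiting factor, one option is to insert the unique hashes in fixed-size chunks rather than as one giant batch, so the graph held in memory at any time stays bounded. This is only a sketch under that assumption; the method name and chunk size of 100,000 are purely illustrative:
private static void ActionUniqueHashesChunked()
{
    // Illustrative chunk size - tune to the available RAM
    const int chunkSize = 100000;

    for (int i = 0; i < uniqueHashes.Length; i += chunkSize)
    {
        // Take the next slice of unique hashes
        var chunk = uniqueHashes.Skip(i).Take(chunkSize).ToArray();

        // A fresh context per chunk keeps the tracked graph small
        using (MySQLContext db = new MySQLContext())
        {
            db.Hashes.BulkInsert(chunk, options => options.IncludeGraph = true);
        }
    }
}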
Upvotes: 0
Reputation: 89416
This is never going to be the fastest method, but at a minimum you need to avoid accumulating all the entities in the change tracker. E.g. after each SaveChanges(), run:
foreach (var e in db.ChangeTracker.Entries())
{
    e.State = EntityState.Detached;
}
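Slotted into the batching loop from the question, that looks roughly like this (a sketch combining the question's save-every-1000 code with the detach step):
// Save changes every 1000 records
if (loopCounter % 1000 == 0)
{
    db.ChangeTracker.DetectChanges();
    db.SaveChanges();

    // Detach the just-saved entities so the change tracker
    // doesn't keep growing (and slowing down) with every batch
    foreach (var e in db.ChangeTracker.Entries())
    {
        e.State = EntityState.Detached;
    }
}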
Upvotes: 2