Reputation: 35
I have a situation where I am trying to read massive binary files. The files contain millions of key-value pairs of pure integers (long). I can read them one at a time with a BinaryReader using ReadInt64(), but this is time-consuming and ends up being slower than reading the same data as plain text. Does anyone know a better way to quickly read binary files containing key-value pairs of integers? I got something to work a little faster with a buffer, but it's still tricky to read in each long. There has to be a better way. Any help is appreciated! Thanks
Upvotes: 2
Views: 759
Reputation: 2069
Tested Matthew Watson's code (which should be acknowledged as the answer).
It's fast indeed! I added the DllImport necessary for ReadFile (kernel32) and avoided using unsafe code by introducing an empty NativeOverlapped ref to replace the IntPtr.Zero.
The file under test is a 5.9 MB HTML file, read as a byte[] chunk in 3-5 ms from an SSD.
/// <summary>
/// Small timing harness: reads an HTML file through <c>TestIt.FastRead</c> and
/// dumps the first and last bytes of the result to the debug output.
/// </summary>
class Program
{
    /// <summary>
    /// Times a single bulk read of up to 10,000,000 bytes and prints a preview of the data.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Number of bytes echoed from the start and from the end of the buffer.
        const int PreviewLength = 200;

        Stopwatch sw = new Stopwatch();
        sw.Start();
        byte[] b;
        // Dispose the stream deterministically so the file handle is released,
        // and open it read-only because the file is never written.
        using (FileStream fs = new FileStream("Tracking Covid-19 cases in the US.htm", FileMode.Open, FileAccess.Read))
        {
            b = TestIt.FastRead<byte>(fs, 10000000);
            // Stop the clock before the stream is closed so only open + read is measured.
            sw.Stop();
        }
        Debug.WriteLine("b.Length=" + b.Length + " sw=" + sw.ElapsedMilliseconds);

        // Clamp the preview so files shorter than PreviewLength do not index out of range.
        int shown = Math.Min(PreviewLength, b.Length);
        for (int i = 0; i < shown; i++)
        {
            Debug.Write((char)b[i]);
        }
        Debug.WriteLine("==");
        Debug.WriteLine("==");
        for (int i = 0; i < shown; i++)
        {
            Debug.Write((char)b[b.Length - shown + i]); // END </html>
        }
        Console.ReadKey();
    }
}
/// <summary>
/// Windows-only helper that fills an array directly from a file via the Win32
/// <c>ReadFile</c> API, avoiding an intermediate managed buffer.
/// </summary>
public static class TestIt
{
    // https://stackoverflow.com/questions/66789631/fastest-way-to-read-large-binary-file-into-array-of-int-in-c-sharp/67332253#67332253

    /// <summary>
    /// P/Invoke declaration of kernel32 <c>ReadFile</c>, taking the OVERLAPPED structure by reference.
    /// </summary>
    [DllImport("kernel32.dll", SetLastError = true)]
    static extern bool ReadFile(Microsoft.Win32.SafeHandles.SafeFileHandle hFile, [Out] IntPtr lpBuffer, uint nNumberOfBytesToRead,
        out uint lpNumberOfBytesRead, [In] ref System.Threading.NativeOverlapped lpOverlapped);

    /// <summary>
    /// Reads up to <paramref name="count"/> elements of type <typeparamref name="T"/> from
    /// <paramref name="fs"/>, starting at the stream's current position, straight into a pinned array.
    /// </summary>
    /// <typeparam name="T">The element type; its size is taken from <see cref="Marshal.SizeOf(Type)"/>.</typeparam>
    /// <param name="fs">The file stream to read from.</param>
    /// <param name="count">The maximum number of elements to read. Must not be negative.</param>
    /// <returns>
    /// The elements that were read. The array is shorter than <paramref name="count"/> (possibly empty)
    /// when fewer whole elements remain between the current position and the end of the file.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
    /// <exception cref="IOException">
    /// <c>ReadFile</c> failed (the inner exception is a <see cref="Win32Exception"/>), or the file
    /// ended before the expected number of bytes was read.
    /// </exception>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Reliability", "CA2004:RemoveCallsToGCKeepAlive")]
    public static T[] FastRead<T>(FileStream fs, int count) where T : struct
    {
        // Largest request passed to a single ReadFile call; bigger reads are split into several calls.
        const long MaxBytesPerCall = 0x40000000;

        if (fs == null)
        {
            throw new ArgumentNullException(nameof(fs));
        }
        if (count < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(count), "The number of elements to read must not be negative.");
        }

        int sizeOfT = Marshal.SizeOf(typeof(T));
        long startOffset = fs.Position;
        // Clamp at zero: the position may have been moved beyond the end of the file.
        long bytesRemaining = Math.Max(0L, fs.Length - startOffset);
        // Widen BEFORE multiplying; int * int would overflow for large counts.
        long wantedBytes = (long)count * sizeOfT;
        long bytesAvailable = Math.Min(bytesRemaining, wantedBytes);
        long availableValues = bytesAvailable / sizeOfT;
        long bytesToRead = availableValues * sizeOfT;
        if ((bytesRemaining < wantedBytes) && ((bytesRemaining - bytesToRead) > 0))
        {
            Debug.WriteLine("Requested data exceeds available data and partial data remains in the file.");
        }

        T[] result = new T[availableValues];
        if (bytesToRead == 0)
        {
            // Nothing to transfer, so there is no need to pin the array or call into Win32.
            return result;
        }

        GCHandle gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
        try
        {
            long bufferAddress = gcHandle.AddrOfPinnedObject().ToInt64();
            long totalRead = 0;
            while (totalRead < bytesToRead)
            {
                uint chunkSize = (uint)Math.Min(bytesToRead - totalRead, MaxBytesPerCall);

                // A non-null OVERLAPPED makes ReadFile start at the offset stored in the structure,
                // so it must carry the intended file offset instead of being left zeroed.
                long fileOffset = startOffset + totalRead;
                var overlapped = new System.Threading.NativeOverlapped();
                overlapped.OffsetLow = unchecked((int)fileOffset);
                overlapped.OffsetHigh = (int)(fileOffset >> 32);

                uint bytesRead;
                if (!ReadFile(
                    fs.SafeFileHandle,
                    new IntPtr(bufferAddress + totalRead),
                    chunkSize,
                    out bytesRead,
                    ref overlapped))
                {
                    throw new IOException("Unable to read file.", new Win32Exception(Marshal.GetLastWin32Error()));
                }
                if (bytesRead == 0)
                {
                    // Fail loudly instead of returning a partially filled array.
                    throw new EndOfStreamException("The file ended before the requested data could be read.");
                }
                totalRead += bytesRead;
            }
        }
        finally
        {
            gcHandle.Free();
        }
        GC.KeepAlive(fs);
        return result;
    }
}
}
}
Upvotes: 2
Reputation: 109762
If you only want to run the application on Windows, you can speed things up a bit.
Note: The following code only works if T is a primitive value type:
/// <summary>
/// Reads array data from a file stream as quickly as possible,
/// without making any additional copies of the data.
/// </summary>
/// <typeparam name="T">The type of the array elements.</typeparam>
/// <param name="fs">The file stream from which to read.</param>
/// <param name="count">The number of elements to read. Must not be negative.</param>
/// <returns>
/// The array of elements that was read. This may be less than the number that was
/// requested if the end of the file was reached. It may even be empty.
/// NOTE: There may still be data left in the file, even if not all the requested
/// elements were returned - this happens if the number of bytes remaining in the
/// file is less than the size of the array elements.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
/// <exception cref="IOException">
/// Thrown on error. See inner exception for <see cref="Win32Exception"/>. Also thrown
/// (as <see cref="EndOfStreamException"/>) if the file ends before the expected data was read.
/// </exception>
[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Reliability", "CA2004:RemoveCallsToGCKeepAlive")]
public static T[] FastRead<T>(FileStream fs, int count) where T: struct
{
    // Largest request passed to a single ReadFile call; bigger reads are split into several calls.
    const long MaxBytesPerCall = 0x40000000;

    if (fs == null)
    {
        throw new ArgumentNullException(nameof(fs));
    }
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), "The number of elements to read must not be negative.");
    }

    int sizeOfT = Marshal.SizeOf(typeof(T));
    // Clamp at zero: the position may have been moved beyond the end of the file.
    long bytesRemaining = Math.Max(0L, fs.Length - fs.Position);
    // Widen BEFORE multiplying; int * int would overflow for large counts.
    long wantedBytes = (long)count * sizeOfT;
    long bytesAvailable = Math.Min(bytesRemaining, wantedBytes);
    long availableValues = bytesAvailable / sizeOfT;
    long bytesToRead = availableValues * sizeOfT;
    if ((bytesRemaining < wantedBytes) && ((bytesRemaining - bytesToRead) > 0))
    {
        Debug.WriteLine("Requested data exceeds available data and partial data remains in the file.");
    }

    T[] result = new T[availableValues];
    if (bytesToRead == 0)
    {
        // Nothing to transfer, so there is no need to pin the array or call into Win32.
        return result;
    }

    GCHandle gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
    try
    {
        long bufferAddress = gcHandle.AddrOfPinnedObject().ToInt64();
        long totalRead = 0;
        while (totalRead < bytesToRead)
        {
            // Never hand ReadFile more than fits in its 32-bit length parameter.
            uint chunkSize = (uint)Math.Min(bytesToRead - totalRead, MaxBytesPerCall);
            uint bytesRead;
            if (!ReadFile(
                fs.SafeFileHandle,
                new IntPtr(bufferAddress + totalRead),
                chunkSize,
                out bytesRead,
                IntPtr.Zero))
            {
                throw new IOException("Unable to read file.", new Win32Exception(Marshal.GetLastWin32Error()));
            }
            if (bytesRead == 0)
            {
                // Fail loudly instead of returning a partially filled array.
                throw new EndOfStreamException("The file ended before the requested data could be read.");
            }
            totalRead += bytesRead;
        }
    }
    finally
    {
        gcHandle.Free();
    }
    GC.KeepAlive(fs);
    return result;
}
Upvotes: 2