Reputation: 658
I implemented the following two functions for RLE compression of binary files.
char* RLEcompress(char* data, size_t origSize, size_t* compressedSize) {
char* ret = calloc(2 * origSize, 1);
size_t retIdx = 0, inIdx = 0;
size_t retSize = 0;
while (inIdx < origSize) {
size_t count = 1;
size_t contIdx = inIdx;
while (contIdx < origSize - 1 && data[inIdx] == data[++contIdx]) {
count++;
}
size_t tmpCount = count;
// break down counts with 2 or more digits into counts ≤ 9
while (tmpCount > 9) {
tmpCount -= 9;
ret[retIdx++] = data[inIdx];
ret[retIdx++] = data[inIdx];
ret[retIdx++] = '9';
retSize += 3;
}
ret[retIdx++] = data[inIdx];
retSize += 1;
if (tmpCount > 1) {
// repeat character (this tells the decompressor that the next digit
// is in fact the # of consecutive occurrences of this char)
ret[retIdx++] = data[inIdx];
// convert single-digit count to dataing
ret[retIdx++] = '0' + tmpCount;
retSize += 2;
}
inIdx += count;
}
*compressedSize = retSize;
return ret;
}
char* RLEdecompress(char* data, size_t compressedSize, size_t uncompressedSize, size_t extraAllocation) {
char* ret = calloc(uncompressedSize + extraAllocation, 1);
size_t retIdx = 0, inIdx = 0;
while (inIdx < compressedSize) {
ret[retIdx++] = data[inIdx];
if (data[inIdx] == data[inIdx + 1]) { // next digit is the # of occurrences
size_t occ = ((data[inIdx + 2]) - '0');
for (size_t i = 1; i < occ && retIdx < compressedSize; i++) {
ret[retIdx++] = data[inIdx];
}
inIdx += 2;
}
inIdx += 1;
}
return ret;
}
They seem to work fine, i.e. diff
doesn't produce any output when comparing the original files to the compressed-then-uncompressed versions.
However, every once in a while, the files will differ indicating there is a bug somewhere. I haven't been able to find a pattern in the files that exhibit this, but I'll give you an example of what the difference looks like.
The lower one is the original.
As you can see, the byte 21 is repeated twice in the compressed-then-uncompressed version. I haven't been able to identify the issue. Unfortunately the bug happens with very few files: so far I've only observed it with two pdf files, including the one shown above, but I can't share them because it's copyrighted content, but I'm working on coming up with another file that fails so I can provide you with an example.
I have a feeling there is something "obvious" wrong with the code above and I'm just missing it. Help is greatly appreciated.
EDIT:
Here's a test program I'm using to read the offending file, compressing it, then decompressing it. I'm also saving the compressed one to disk in a middle step to have more debug data.
int main(int argc, char** argv) {
size_t compsz;
FILE* fp = fopen(argv[1], "r");
if (!fp) {
perror("fp");
return 1;
}
if (fseek(fp, 0L, SEEK_END) == -1) {
return -1;
}
// get file size
size_t filecontentLen = ftell(fp);
if (filecontentLen < 0) {
return -1;
}
rewind(fp);
char* filecontentBuf = calloc(filecontentLen, 1);
if (!filecontentBuf) {
fclose(fp);
errno = ENOMEM;
return -1;
}
// read original
if (fread(filecontentBuf, sizeof(char), filecontentLen, fp) <= 0) {
int errnosave = errno;
if (ferror(fp)) {
fclose(fp);
free(filecontentBuf);
errno = errnosave;
return -1;
}
}
// write compressed
char* compressed = RLEcompress(filecontentBuf, filecontentLen, &compsz);
FILE* fpcompWrite = fopen("compressed", "w+");
if (fwrite(compressed, compsz, 1, fpcompWrite) == -1) {
perror("fwrite");
}
fclose(fpcompWrite);
// read compressed
FILE* fpcompRead = fopen("compressed", "r");
if (!fpcompRead) {
perror("fpcompRead");
return 1;
}
char* compBuf = calloc(compsz * 2, 1);
fread(compBuf, compsz, 1, fpcompRead);
fclose(fpcompRead);
// decompress and write file
char* uncompBuf = RLEdecompress(compBuf, compsz, filecontentLen, 0);
FILE* funcomp = fopen("uncompressed", "w+");
fwrite(uncompBuf, filecontentLen, 1, funcomp);
fclose(funcomp);
}
Upvotes: 2
Views: 105
Reputation: 66
I think the problem is that
for (size_t i = 1; i < occ && retIdx < compressedSize; i++) {
ret[retIdx++] = data[inIdx];
}
should be changed in
for (size_t i = 1; i < occ && retIdx < uncompressedSize; i++) {
ret[retIdx++] = data[inIdx];
}
in the decompression algorithm, since redIdx
is bounded by uncompressedSize
, and maybe in some rare cases it copies fewer bytes than it should.
Upvotes: 2