Reputation: 17
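I’m trying to implement an MP3-like compression algorithm for audio and have followed the general steps, but I’m encountering a few issues with the quantization step. Here's the overall process I'm following: read the WAV file and split it into left and right channels; apply overlapping Hann windows to each channel; run the forward filter bank (`mp3_forward_fbt`); take the DCT of the result; quantize the DCT coefficients; then apply the inverse DCT and the reverse filter bank and write the reconstructed audio to a new WAV file.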
The process works fine, but I’m running into problems during the quantization step. When I remove any number of coefficients (to reduce data), the audio quality degrades significantly. I’ve also tried converting the coefficient array from float to int32 or int16, but the output file size remains the same as the input file size.
I’m looking for insights into what I might be missing or suggestions on how to improve quantization and file size reduction. Here is the code I’m using:
import os
import numpy as np
import scipy.io.wavfile as wav
from scipy.fftpack import dct, idct
import mp3funcs as mp
# Framing parameters: 512-sample frames that advance by half a frame,
# each weighted by a Hann window of the same length.
window_size = 512
hop_size = window_size // 2
hanning_window = np.hanning(window_size)

# Load the source recording; `fs` is the sample rate returned by wav.read.
file_path = os.path.abspath('Projekt_1/audio.wav')
fs, audio = wav.read(file_path)

# Columns 0 and 1 of the sample array are used as the left/right channels.
left_audio, right_audio = audio[:, 0], audio[:, 1]
def apply_hanning(audio):
    """Window *audio* frame by frame and overlap-add the frames.

    Frames are ``window_size`` samples long and start every ``hop_size``
    samples. Each frame is multiplied by ``hanning_window`` and accumulated
    into a float64 buffer of the same length as *audio*. Samples after the
    last complete frame are never written, so they remain zero.
    """
    total = len(audio)
    out = np.zeros(total, dtype=np.float64)
    frame_count = (total - window_size) // hop_size + 1
    # Step directly over the frame start offsets; the range is empty when
    # the input is shorter than one frame.
    for begin in range(0, frame_count * hop_size, hop_size):
        stop = begin + window_size
        out[begin:stop] += audio[begin:stop] * hanning_window
    return out
# --- Analysis ---------------------------------------------------------------
# Window each channel, pass it through the forward filter bank, then take an
# orthonormal DCT-II along the last axis.
left_audio = apply_hanning(left_audio)
right_audio = apply_hanning(right_audio)
left_audio = mp.mp3_forward_fbt(left_audio)
right_audio = mp.mp3_forward_fbt(right_audio)
left_dct = dct(left_audio, type=2, n=None, axis=-1, norm='ortho')
right_dct = dct(right_audio, type=2, n=None, axis=-1, norm='ortho')

# --- Quantization -----------------------------------------------------------
# Uniform scalar quantizer: snap every coefficient to the nearest multiple of
# `quant_step`. The quantized arrays must be derived from the DCT output;
# starting from an empty list discards the signal and leaves the inverse
# transform with zero-length input.
# `quant_step` is a starting value to tune: a larger step is coarser (more
# loss), a smaller step is finer. The integers rint(coeff / quant_step) are
# what an encoder would actually store.
quant_step = 16.0
left_quant = np.rint(left_dct / quant_step) * quant_step
right_quant = np.rint(right_dct / quant_step) * quant_step

# Zero-pad so the length is a multiple of 32.
# NOTE(review): a single (before, after) pair pads every axis, so this assumes
# the coefficient arrays are 1-D -- confirm against mp3_forward_fbt's output.
padding_needed_left = (32 - (len(left_quant) % 32)) % 32
left_quant = np.pad(left_quant, (0, padding_needed_left), mode='constant')
padding_needed_right = (32 - (len(right_quant) % 32)) % 32
right_quant = np.pad(right_quant, (0, padding_needed_right), mode='constant')

# --- Synthesis --------------------------------------------------------------
left_audio = idct(left_quant, type=2, n=None, axis=-1, norm='ortho')
right_audio = idct(right_quant, type=2, n=None, axis=-1, norm='ortho')
left_audio = mp.mp3_reverse_fbt(left_audio)
right_audio = mp.mp3_reverse_fbt(right_audio)

# Trim both channels to a common length so they can be stacked as columns.
min_length = min(len(left_audio), len(right_audio))
left_audio = left_audio[:min_length]
right_audio = right_audio[:min_length]
audio = np.column_stack((left_audio, right_audio))

# Round to the nearest integer and clamp to the int16 range before casting.
# A bare float -> int16 cast truncates toward zero and wraps out-of-range
# samples around, which is heard as loud clicks and distortion.
int16_limits = np.iinfo(np.int16)
audio = np.clip(np.rint(audio), int16_limits.min, int16_limits.max)

output_file_path = 'Projekt_1/output.wav'
wav.write(output_file_path, fs, audio.astype(np.int16))

# Test
# wav.write produces an uncompressed WAV, so its size depends only on the
# number of samples, the number of channels and the sample dtype. Quantizing
# the coefficients cannot shrink this file; to measure compression, serialize
# the quantized integers (ideally entropy-coded) and compare that size.
print("Size difference:", os.path.getsize(output_file_path) - os.path.getsize(file_path))
Upvotes: 1
Views: 35