Reputation:
I have a deep learning code written in python (Anaconda3, Ubuntu 16.04). It basically does ad detection for a given video based on a trained model, must return if the video is ad or not (we assume it is only a single-shot video). My colleague who is unavailable right now wrote this.
The original file had a loop somewhere (for multiple shot) iterating over the list of the shots, and now that loop is removed assuming the video is a single shot. But looks some array sizes are messed up, hence the error. How to fix the problem?
I'm not familiar with python and deep learning, and it should be general python programming issues, not the semantics because it worked fine just before.
Here is the error:
File "/Ad_module_textfast_stream.py", line 36, in label_prediction
pred_labels= clf_trained.predict( mfcc_feat.reshape(-1, 200) )
ValueError: cannot reshape array of size 8640 into shape (200)
So it should run when calling video_audio_extractor(video_name)
. Here is the code. Right now final_label_list
must contain if it is video or ad. And I guess since it is only a single shot, it must contain only one element.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time, cv2, librosa
import skvideo.io
import numpy as np
import tensorflow as tf
import subprocess, os, glob
from sklearn.externals import joblib
#################################### Loading the Dataset
def label_prediction( mfcc_list, num_frames= 3):
num_shots= len(mfcc_list)
pred_labels_list= list()
final_label_list= list()
if 2>1:
clf_trained = joblib.load('trainsvm_mfcc.pkl')
#predicted_train_labels= clf_trained.predict( mfcc_list)
for j in range(num_shots):
mfcc_feat = mfcc_list[j]
num_frames= 1
mfcc_feat= ( mfcc_feat- np.mean(mfcc_feat) ) / ( np.std(mfcc_feat)+ 1e-6 )
#### now access operation you wanna run
#pred_labels= clf_trained.predict( mfcc_feat)
pred_labels= clf_trained.predict( mfcc_feat.reshape(-1, 200) )
final_label= 0 ## ads:1, and video:0
if pred_labels> 0:
final_label= 1
pred_labels_list.append(pred_labels)
final_label_list.append(final_label)
################ post-processing
final_label_list[0]= 0 ### video starts with natural content
final_label_list[len(final_label_list)-1]= 0 ## last shot
for kk in range(2,len(final_label_list)-2): ### one video between many ads, most likely to be ads
if final_label_list[kk]==0 and final_label_list[kk-2]==1 and final_label_list[kk-1]==1 and final_label_list[kk+1]==1 and final_label_list[kk+2]==1:
final_label_list[kk]= 1
return final_label_list, pred_labels_list
def video_audio_extractor( video_name):
cur_video= skvideo.io.vread(video_name)
metadata = skvideo.io.ffprobe(video_name)
vid_info= metadata["video"]
items = list(vid_info.items())
avg_fps_info= items[22][1]
avg_fps= int(avg_fps_info[0:2])
cur_num_frame= cur_video.shape[0]
cur_audio, cur_sr= librosa.load(video_name)
mfcc_list= list()
cur_audioshot_mfcc= librosa.feature.mfcc( y= cur_audio, sr= cur_sr, n_mfcc=20)
cur_audioshot_mfcc_1d= np.reshape( cur_audioshot_mfcc, [cur_audioshot_mfcc.shape[0]*cur_audioshot_mfcc.shape[1],])
mfcc_list.append(cur_audioshot_mfcc_1d)
final_label_list, pred_labels_list= label_prediction( mfcc_list, num_frames= 3)
return mfcc_list, avg_fps, final_label_list
Below is the original video_audio_extractor
function for which contains the for loop:
def video_audio_extractor( video_name):
cur_video= skvideo.io.vread(video_name)
metadata = skvideo.io.ffprobe(video_name)
vid_info= metadata["video"]
items = list(vid_info.items())
avg_fps_info= items[22][1]
avg_fps= int(avg_fps_info[0:2])
cur_num_frame= cur_video.shape[0]
cur_audio, cur_sr= librosa.load(video_name)
cur_shot_name= 'video_shots.txt'
#cur_shot_name= cur_video_name[0:-4]+'_shots.txt'
line = list(open(cur_shot_name, 'r'))
mfcc_list= list()
for shot_ind in range(len(line)):
cur_line= line[ shot_ind]
cur_line_list= cur_line.split()
first_frame= int( cur_line_list[0] )
last_frame = int( cur_line_list[1] )
cur_audioshot_first_ind= int( np.floor(first_frame*len(cur_audio)/cur_num_frame ) )
cur_audioshot_last_ind = int( np.floor(last_frame *len(cur_audio)/cur_num_frame ) )
cur_audioshot= cur_audio[cur_audioshot_first_ind:cur_audioshot_last_ind]
new_rate= 5000*cur_sr/len(cur_audioshot)
cur_audioshot_resampled = librosa.resample(cur_audioshot, cur_sr, new_rate)
cur_audioshot_mfcc= librosa.feature.mfcc(y=cur_audioshot_resampled, sr= new_rate, n_mfcc=20)
cur_audioshot_mfcc_1d= np.reshape( cur_audioshot_mfcc, [cur_audioshot_mfcc.shape[0]*cur_audioshot_mfcc.shape[1],])
mfcc_list.append(cur_audioshot_mfcc_1d)
return mfcc_list, line, avg_fps
Upvotes: 0
Views: 136
Reputation: 5936
I think the problem is that your old video_audio_extractor
code changes the sampling rate and resamples the audio. Here's the code:
new_rate= 5000*cur_sr/len(cur_audioshot)
cur_audioshot_resampled = librosa.resample(cur_audioshot, cur_sr, new_rate)
cur_audioshot_mfcc= librosa.feature.mfcc(y=cur_audioshot_resampled, sr= new_rate, n_mfcc=20)
Your new video_audio_extractor
doesn't use this new rate or do any resampling. That's why the new shape of mfcc_feat
is different than the old shape of mfcc_feat
. What happens if you replace the following line:
pred_labels= clf_trained.predict( mfcc_feat.reshape(-1, 200) )
with this line:
pred_labels= clf_trained.predict( mfcc_feat )
Upvotes: 0