Reputation: 1
I've got 200 GB of folders with images, and some of the images can't be opened. I want to find these images and delete them from their folders.
I tried Python code like this:
for image in all_image:
    try:
        # open image
    except:
        # delete image
But it's too slow. How can I do it faster? How can I parallelize this code?
Here is my full code:
import os
from pathlib import Path

import pandas as pd
from PIL import Image
from tqdm import tqdm


def create_df(data_path):
    data = pd.DataFrame()
    folder_names = [i for i in data_path.iterdir() if i.is_dir()]
    files = [j for i in folder_names for j in i.glob('*.jpg')]
    data['path'] = [str(i) for i in files]
    data['label'] = [i.parts[-2] for i in files]
    return data


if __name__ == "__main__":
    root = Path('some_path')
    data_root = root / 'dataset'
    df = create_df(data_root)
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            # verify() actually checks the file; open() alone only reads the header
            with Image.open(row.path) as img:
                img.verify()
        except Exception:
            print(row.path)
            print(row)
            if os.path.exists(row.path):
                os.remove(row.path)
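Note that even verify() does not decode the pixel data, so some truncated files can still pass; fully loading the image is the strictest check. A minimal sketch of such a check (the helper name is_image_ok is just for illustration):

from PIL import Image

def is_image_ok(path):
    # Return True if the image both verifies and fully decodes.
    try:
        with Image.open(path) as img:
            img.verify()   # checks integrity markers without decoding
        # verify() leaves the file unusable, so reopen before decoding
        with Image.open(path) as img:
            img.load()     # decodes pixel data; catches truncated files
        return True
    except Exception:
        return False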
Upvotes: 0
Views: 431
Reputation: 8270
You can use a pool from the multiprocessing module to parallelize the work. The example below uses multiprocessing.pool.ThreadPool, which runs the checks in worker threads:
import os
from PIL import Image
from multiprocessing.pool import ThreadPool

IMAGE_EXT = ('.jpg', '.jpeg', '.png', '.gif')

def check_image(image_path):
    try:
        # verify() checks for corruption; the with-block closes the
        # file handle before we try to delete it
        with Image.open(image_path) as img:
            img.verify()
        print(f'Image is OK: {image_path}')
    except Exception:
        os.remove(image_path)
        print(f'Image deleted: {image_path}')

def delete_broken_images(root_dir):
    pool = ThreadPool(processes=10)
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith(IMAGE_EXT):
                image_path = os.path.join(subdir, file)
                # don't call .get() on each task: that blocks until the
                # task finishes and would serialize the whole run
                pool.apply_async(check_image, (image_path,))
    pool.close()
    pool.join()  # wait for all queued checks to finish

delete_broken_images(r'c:\so\69805310\images')
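A thread pool helps most when the bottleneck is disk I/O; if decoding dominates (for example when fully loading each image), the GIL limits threads and a real process pool may scale better. A minimal sketch using multiprocessing.Pool (the worker count of 8 and the chunksize are arbitrary placeholders, and the root path reuses the example above):

import os
from multiprocessing import Pool
from PIL import Image

IMAGE_EXT = ('.jpg', '.jpeg', '.png', '.gif')

def check_image(image_path):
    # Return the path if the file was broken and deleted, else None.
    try:
        with Image.open(image_path) as img:
            img.verify()
    except Exception:
        os.remove(image_path)
        return image_path
    return None

def find_images(root_dir):
    # Yield the paths of all files with a known image extension.
    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith(IMAGE_EXT):
                yield os.path.join(subdir, file)

if __name__ == '__main__':
    with Pool(processes=8) as pool:
        # imap_unordered streams results back as workers finish
        for deleted in pool.imap_unordered(check_image,
                                           find_images(r'c:\so\69805310\images'),
                                           chunksize=64):
            if deleted:
                print(f'Image deleted: {deleted}')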
Upvotes: 1