Reputation: 11
I am trying to run this script on my university's GPU cluster (which has CUDA), but I get the error below.
/var/spool/gridscheduler/execd/node3b04/job_scripts/44254130: line 54: 29041 Segmentation fault (core dumped) python process_images.py --domain Baby_Products --device cuda:0 > $OUTPUT_DIR/process_images_baby.log 2>&1
/var/spool/gridscheduler/execd/node3b04/job_scripts/44254130: line 55: 29212 Segmentation fault (core dumped) python process_images.py --domain Video_Games --device cuda:0 > $OUTPUT_DIR/process_images_games.log 2>&1
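The scheduler log only reports the signal, not where in Python it happened. One way to narrow it down (a minimal sketch, using only the standard-library faulthandler module) is to dump a Python-level traceback when the segfault hits:

# At the very top of process_images.py, before torch is imported:
import faulthandler
faulthandler.enable()  # on SIGSEGV, dumps the Python traceback to stderr

Equivalently, the script can be launched as python -X faulthandler process_images.py ... without touching the code; the traceback then shows which call (model load, CUDA transfer, inference, ...) was executing when the crash occurred.

Here is the script: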
import os
import argparse
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from tqdm import tqdm
import requests
from io import BytesIO
import numpy as np
from datasets import load_dataset
import json
import traceback
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--domain', type=str, required=True, help="Domain to process")
    parser.add_argument('--output_dir', type=str, default='dataset/processed/', help="Output directory for image embeddings")
    parser.add_argument('--image_embedding_dir', type=str, default='image_embeddings', help="Directory to store image embeddings")
    parser.add_argument('--device', type=str, default='cpu', help="Device to use for computation")
    parser.add_argument('--batch_size', type=int, default=16, help="Batch size for processing images")
    return parser.parse_args()
def main():
    args = parse_args()
    device = torch.device(args.device)
    logger.info(f"Using device: {device}")

    try:
        # Load the pre-trained CLIP model and processor
        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        model.to(device)
        model.eval()

        # Load the dataset
        dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_meta_{args.domain}", split="full", trust_remote_code=True)

        # Load relevant item IDs
        # Use for 5-core filtering
        # relevant_items_path = os.path.join(args.output_dir, args.domain, 'relevant_item_ids.json')
        # with open(relevant_items_path, 'r') as f:
        #     relevant_item_ids = set(json.load(f))
        # item_image_urls = {item['parent_asin']: item['images']['large'] for item in dataset if item['parent_asin'] in relevant_item_ids and len(item['images']['large']) > 0}

        # Check for None before calling len() so the condition short-circuits safely
        item_image_urls = {item['parent_asin']: item['images']['large'] for item in dataset if item['images']['large'] is not None and len(item['images']['large']) > 0}
        logger.info(f"Number of items with images: {len(item_image_urls)}")

        # Create the output directory if it doesn't exist
        os.makedirs(args.output_dir, exist_ok=True)
        domain_dir = os.path.join(args.output_dir, args.domain, args.image_embedding_dir)
        os.makedirs(domain_dir, exist_ok=True)

        # Generate CLIP embeddings for the images and save them to disk
        for item_id, image_urls in tqdm(item_image_urls.items(), desc=f"Generating CLIP embeddings for {args.domain}"):
            embedding_path = os.path.join(domain_dir, f"{item_id}.npy")

            # Check if the embedding already exists
            if os.path.exists(embedding_path):
                logger.info(f"Embedding for item {item_id} already exists. Skipping.")
                continue

            try:
                all_image_features = []
                for i in range(0, len(image_urls), args.batch_size):
                    batch_urls = image_urls[i:i + args.batch_size]
                    batch_images = []
                    for image_url in batch_urls:
                        logger.info(f"Processing image: {image_url}")
                        # Time out rather than hang forever, and fail early on HTTP errors
                        response = requests.get(image_url, timeout=10)
                        response.raise_for_status()
                        image = Image.open(BytesIO(response.content)).convert("RGB")
                        batch_images.append(image)

                    inputs = processor(images=batch_images, return_tensors="pt").to(device)
                    with torch.no_grad():
                        image_features = model.get_image_features(**inputs)
                    all_image_features.append(image_features.cpu().numpy())

                    del inputs
                    del image_features
                    del batch_images
                    if device.type == "cuda":
                        torch.cuda.empty_cache()

                if all_image_features:
                    # Mean-pool the per-image features into one embedding per item
                    aggregated_image_features = np.concatenate(all_image_features, axis=0).mean(axis=0)
                    np.save(embedding_path, aggregated_image_features)
            except Exception as e:
                logger.error(f"Error processing item {item_id}: {str(e)}")
                logger.error(traceback.format_exc())
                continue
    except Exception as e:
        logger.error(f"Error in main function: {str(e)}")
        logger.error(traceback.format_exc())

if __name__ == "__main__":
    main()
When I run the same script on the CPU, everything works fine, but it is far too slow. How can I resolve this segmentation fault so that I can use CUDA?
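For what it's worth, a quick sanity check of the node's CUDA setup (a sketch using only standard torch calls, assuming nothing about the cluster) would be:

import torch

# Report the PyTorch build and the CUDA toolkit it was compiled against;
# a mismatch with the node's driver is a common cause of native crashes
print(torch.__version__, torch.version.cuda)
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    # Tiny smoke test: if this alone segfaults, the problem is in the
    # environment (driver/toolkit/build), not in the CLIP script itself
    x = torch.randn(4, 4, device="cuda:0")
    print((x @ x).sum().item())

If a snippet like that runs cleanly but the full script still crashes, the faulthandler traceback above should point at the failing call.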
Upvotes: 0
Views: 203