Jean Brion

Segmentation fault (core dumped): PyTorch and CLIP

I am trying to run this script on my university's GPU cluster (which has CUDA), but I get the error below.

/var/spool/gridscheduler/execd/node3b04/job_scripts/44254130: line 54: 29041 Segmentation fault (core dumped) python process_images.py --domain Baby_Products --device cuda:0 > $OUTPUT_DIR/process_images_baby.log 2>&1
/var/spool/gridscheduler/execd/node3b04/job_scripts/44254130: line 55: 29212 Segmentation fault (core dumped) python process_images.py --domain Video_Games --device cuda:0 > $OUTPUT_DIR/process_images_games.log 2>&1
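To narrow down where the crash happens, a minimal smoke test like the one below (independent of CLIP and transformers) could show whether CUDA initialization alone segfaults on the node; the cuda:0 device string matches what I pass to the script:

# cuda_sanity_check.py -- hypothetical minimal CUDA smoke test.
# If this also segfaults, the problem is in the PyTorch/CUDA/driver setup
# on the node rather than in my script.
import torch

print("PyTorch version:", torch.__version__)
print("Compiled CUDA version:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))
    # A small allocation plus a matmul forces CUDA context creation
    # and an actual kernel launch.
    x = torch.randn(8, 8, device="cuda:0")
    y = x @ x
    torch.cuda.synchronize()
    print("CUDA matmul OK, result shape:", y.shape)

The script itself: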

import os
import argparse
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from tqdm import tqdm
import requests
from io import BytesIO
import numpy as np
from datasets import load_dataset
import json
import traceback
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--domain', type=str, required=True, help="Domain to process")
    parser.add_argument('--output_dir', type=str, default='dataset/processed/', help="Output directory for image embeddings")
    parser.add_argument('--image_embedding_dir', type=str, default='image_embeddings', help="Directory to store image embeddings")
    parser.add_argument('--device', type=str, default='cpu', help="Device to use for computation")
    parser.add_argument('--batch_size', type=int, default=16, help="Batch size for processing images")
    return parser.parse_args()

def main():
    args = parse_args()
    
    device = torch.device(args.device)
    logger.info(f"Using device: {device}")

    try:
        # Load the pre-trained CLIP model and processor
        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        model.to(device)
        model.eval()

        # Load the dataset
        dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_meta_{args.domain}", split="full", trust_remote_code=True)

        # Load relevant item IDs
        # Use for 5-core filtering
        # relevant_items_path = os.path.join(args.output_dir, args.domain, 'relevant_item_ids.json')
        # with open(relevant_items_path, 'r') as f:
        #     relevant_item_ids = set(json.load(f))

        # item_image_urls = {item['parent_asin']: item['images']['large'] for item in dataset if item['parent_asin'] in relevant_item_ids and len(item['images']['large']) > 0}
        # Check for None before taking len(), otherwise a None value raises TypeError
        item_image_urls = {item['parent_asin']: item['images']['large'] for item in dataset if item['images']['large'] is not None and len(item['images']['large']) > 0}

        logger.info(f"Number of items with images: {len(item_image_urls)}")

        # Create the output directory if it doesn't exist
        os.makedirs(args.output_dir, exist_ok=True)
        domain_dir = os.path.join(args.output_dir, args.domain, args.image_embedding_dir)
        os.makedirs(domain_dir, exist_ok=True)

        # Generate CLIP embeddings for the images and save them to disk
        for item_id, image_urls in tqdm(item_image_urls.items(), desc=f"Generating CLIP embeddings for {args.domain}"):
            embedding_path = os.path.join(domain_dir, f"{item_id}.npy")
            
            # Check if the embedding already exists
            if os.path.exists(embedding_path):
                logger.info(f"Embedding for item {item_id} already exists. Skipping.")
                continue

            try:
                all_image_features = []

                for i in range(0, len(image_urls), args.batch_size):
                    batch_urls = image_urls[i:i+args.batch_size]
                    batch_images = []

                    for image_url in batch_urls:
                        logger.info(f"Processing image: {image_url}")
                        # A timeout prevents the worker from hanging forever on a dead URL,
                        # and raise_for_status surfaces HTTP errors instead of feeding
                        # an error page to PIL
                        response = requests.get(image_url, timeout=10)
                        response.raise_for_status()
                        image = Image.open(BytesIO(response.content)).convert("RGB")
                        batch_images.append(image)

                    inputs = processor(images=batch_images, return_tensors="pt").to(device)

                    with torch.no_grad():
                        image_features = model.get_image_features(**inputs)
                        all_image_features.append(image_features.cpu().numpy())

                    del inputs
                    del image_features
                    del batch_images
                    # Only relevant when running on a CUDA device; releases
                    # cached blocks back to the allocator between batches
                    if device.type == "cuda":
                        torch.cuda.empty_cache()

                if all_image_features:
                    aggregated_image_features = np.concatenate(all_image_features, axis=0).mean(axis=0)
                    np.save(embedding_path, aggregated_image_features)
            except Exception as e:
                logger.error(f"Error processing item {item_id}: {str(e)}")
                logger.error(traceback.format_exc())
                continue

    except Exception as e:
        logger.error(f"Error in main function: {str(e)}")
        logger.error(traceback.format_exc())

if __name__ == "__main__":
    main()

When I run the same script on CPU, everything works fine, but it is far too slow. How can I resolve this segmentation fault so that I can work with CUDA?
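One way to get more detail out of the crash would be to enable the standard-library faulthandler module, which prints the Python traceback when a fatal signal such as SIGSEGV arrives, showing whether the crash happens during CUDA initialization, model loading, or inference:

# Add at the very top of process_images.py, before the other imports,
# or equivalently run the script as: python -X faulthandler process_images.py ...
import faulthandler
faulthandler.enable()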
