Kasia Gogolek
Kasia Gogolek

Reputation: 3414

bash: running cURLs in parallel slower than one after another

We have to cache quite a big database of data after each upload, so we created a bash script that should handle it for us. The script should start 4 parallel curls to the site and, as soon as one of them is done, start the next one from the URL list we store in the file.

In theory everything works OK, and the concept works if we run 4 processes from our local machines against the target site.

If I set MAX_NPROC=1, each curl takes as long as it would if a browser hit the URL, i.e. 20s. If I set MAX_NPROC=2, the time each request takes triples.

Am I missing something? Is there an Apache setting that is slowing us down, or is there a secret cURL setting that I'm missing?

Any help will be appreciated. Please find the bash script below

#!/bin/bash

# Maximum number of background jobs allowed to run at the same time.
# Taken from the second positional argument; falls back to 4 when that
# argument is missing or empty.
MAX_NPROC=${2:-4}

# Reject anything that is not a positive integer. The throttle loop further
# down compares NUM against this value with -ge: 0 would make that loop spin
# forever, and a non-number would make the test error out on every pass,
# which silently disables the limit.
if ! [[ $MAX_NPROC =~ ^[1-9][0-9]*$ ]]; then
    echo "MAX_NPROC must be a positive integer, got: $MAX_NPROC" >&2
    exit 1
fi

# The first positional argument is the file that lists the URLs to request.
# Diagnostics go to stderr and the exit status is non-zero so that a caller
# (cron, CI, another script) can tell that nothing was processed.
if [[ -z $1 ]]; then
    echo "File with URLs is missing" >&2
    exit 1
fi

# Fail before any log file is created if the list cannot be read at all.
if [[ ! -r $1 ]]; then
    echo "Cannot read URL file: $1" >&2
    exit 1
fi

# Job bookkeeping: the space-separated PID list and how many PIDs it holds.
QUEUE=
NUM=0

# Scratch values for the request currently being prepared.
URL=
DATA=

# Path of the error log; empty until it is assigned later in the script.
ERROR_LOG=

# Indexed arrays, declared here without any elements.
declare -a URL_ARRAY TIME_ARRAY

# Register one background job: append the PID passed as $1 to the
# space-separated QUEUE string and raise the NUM counter by one.
queue() {
    QUEUE+=" $1"
    NUM=$((NUM + 1))
}

#######################################
# Rebuilds the job queue so that it lists only PIDs that are still running.
# Globals:
#   QUEUE (read, rewritten) - space-separated list of background PIDs
#   NUM   (rewritten)       - number of PIDs left in QUEUE
# Arguments: none
# Outputs:   prints the queue as it was on entry to stdout
#######################################
function regeneratequeue {
    local previous=$QUEUE
    local pid

    echo "OLDREQUEUE:$previous"

    QUEUE=""
    NUM=0
    # Unquoted on purpose: the list is split on whitespace into single PIDs.
    for pid in $previous; do
        # Signal 0 delivers nothing; kill only reports whether the PID can
        # still be signalled. This replaces forking ps, awk and grep for every
        # PID and scanning the whole system process table on each poll.
        if kill -0 "$pid" 2>/dev/null; then
            QUEUE="$QUEUE $pid"
            NUM=$((NUM + 1))
        fi
    done
}

#######################################
# Reaps every queued job that has finished and logs how it went.
#
# For each finished PID the exit status is collected with wait. A non-zero
# status is appended to ERROR_LOG, and an END line with the elapsed seconds
# is appended to REVERSE_LOG. If at least one job finished, the queue is
# rebuilt once at the end through regeneratequeue.
#
# All finished PIDs are handled in one pass. Stopping after the first one
# would lose the others: regeneratequeue drops every dead PID from QUEUE, so
# a job that finished in the same poll interval would never be waited on or
# logged.
#
# Globals:
#   QUEUE (read), URL_ARRAY and TIME_ARRAY (read, indexed by PID),
#   ERROR_LOG and REVERSE_LOG (files appended to)
# Arguments: none
#######################################
function checkqueue {
    local pid status now started elapsed
    local reaped=0

    # Unquoted on purpose: the list is split on whitespace into single PIDs.
    for pid in $QUEUE; do
        # Signal 0 delivers nothing; kill only reports whether the PID can
        # still be signalled, which avoids a ps | awk | grep pipeline per PID.
        if kill -0 "$pid" 2>/dev/null; then
            continue
        fi

        # The job has already exited, so wait returns at once with its status.
        status=0
        wait "$pid" || status=$?
        if [[ $status -ne 0 ]]; then
            echo "$(date) $status ${URL_ARRAY[$pid]}" >> "$ERROR_LOG"
        fi

        now=$(date +%s)
        # Fall back to "now" so a missing start time logs 0 seconds instead of
        # breaking the arithmetic.
        started=${TIME_ARRAY[$pid]:-$now}
        elapsed=$((now - started))

        echo "$(date) ${URL_ARRAY[$pid]} END ($elapsed seconds)" >> "$REVERSE_LOG"

        reaped=1
    done

    # NOTE(review): a job that exits between the loop above and the liveness
    # check inside regeneratequeue is still dropped without an END line; the
    # window is now very small but not zero.
    if [[ $reaped -eq 1 ]]; then
        regeneratequeue # at least one PID has finished
    fi
}

# Both log paths are derived from the URL-list path given as $1.
REVERSE_LOG="$1.rvrs"
ERROR_LOG="$1.error"

# Start both logs afresh for this run.
echo "Cache STARTED at $(date)" > "$REVERSE_LOG"
# Truncate the error log itself. The redirect target used to be the bare word
# ERROR_LOG, which created a stray file of that name in the working directory
# and left the real error log untouched.
: > "$ERROR_LOG"

# Request every URL listed in "$1", keeping at most MAX_NPROC curl processes
# running at any one time.
#
# IFS= and -r keep each line verbatim; the || clause also picks up a final
# line that has no trailing newline.
while IFS= read -r line || [[ -n $line ]]; do

    # Skip blank lines so curl is never started without a URL.
    if [[ -z $line ]]; then
        continue
    fi

    # create the command to be run
    # NOTE(review): the POST credentials are hard-coded, and the first field
    # looks redacted in this copy of the script - confirm the real value.
    DATA="[email protected]&password=password"
    URL=$line
    # Keep the command as an array. Writing CMD=$(curl ...) executes curl
    # right away in the foreground and stores only its (empty) output, so
    # every request ran one after another and nothing was ever backgrounded.
    CMD=(curl --data "${DATA}" -s -o /dev/null --url "${URL}")

    echo "Command: ${CMD[*]}"
    # Run the command in the background; stdin is detached so the job can
    # never consume lines from the URL list feeding this loop.
    "${CMD[@]}" < /dev/null &
    # Get PID for process
    PID=$!
    queue "$PID"
    URL_ARRAY[$PID]=$URL
    TIME_ARRAY[$PID]=$(date +%s)

    # Throttle: poll until a slot frees up before reading the next URL.
    while [ "$NUM" -ge "$MAX_NPROC" ]; do
        checkqueue
        sleep 0.4
    done
done < "$1"

# Drain the jobs that are still running; otherwise FINISHED would be logged
# while requests are in flight and their END lines would never be written.
while [ "$NUM" -gt 0 ]; do
    checkqueue
    sleep 0.4
done

echo "Cache FINISHED at $(date)" >> "$REVERSE_LOG"
exit

Upvotes: 0

Views: 543

Answers (1)

pizza
pizza

Reputation: 7630

The network is almost always the bottleneck. Spawning more connections usually makes it slower.

You can check whether parallelizing will do you any good by spawning several timed requests in the background, for example:

time curl ...... &

Upvotes: 1

Related Questions