NJS
NJS

Reputation: 506

Why am I getting poor parallel performance in F#

I'm trying to learn F# and this is my first attempt at parallel programming. I'm working on a puzzle to find the longest path through a grid. My pathfinding solution seems like a fairly straightforward recursive algorithm. Then I map/reduce it to find all paths and return the longest.

I have a serial implementation and 3 different parallel implementations of the map/reduce portion. On smaller grids, I see some marginal improvement in speed with the parallel implementations. On a larger grid, the parallel implementations are actually slower! Am I doing something wrong?

The 3 parallel implementations are:

Here are some typical timings for the 4 implementations using different size input grids:

4x4 Grid
GetLongestPath               19.845400
GetLongestPathParallelArray  18.626200
GetLongestPathParallelFor     7.084200
GetLongestPathPSeq          163.271000

5x5 Grid
GetLongestPath              818.967500
GetLongestPathParallelArray 629.563000
GetLongestPathParallelFor   725.072500
GetLongestPathPSeq          772.961300

6x6 Grid
GetLongestPath              3941.354000
GetLongestPathParallelArray 3609.441800
GetLongestPathParallelFor   3509.890500
GetLongestPathPSeq          3295.218600

7x7 Grid
GetLongestPath              24466.655300
GetLongestPathParallelArray 32098.823200
GetLongestPathParallelFor   35274.629500
GetLongestPathPSeq          24980.553600

Here's the code:

module Pathfinder
open System
open System.Threading.Tasks
open Microsoft.FSharp.Collections

let ListContains item list = List.exists (fun x -> x = item) list
let LongestList (x:int list) (y:int list) = if x.Length >= y.Length then x else y

let GetNeighborsNotAlreadyInPath (neighborMap: Map<int, int list>) path =
    neighborMap.[List.head path]
    |> List.filter (fun item -> not (ListContains item path))

let rec GetLongestPathFromAllNeighbors neighborMap currentPath longestPath =
    let neighbors = GetNeighborsNotAlreadyInPath neighborMap currentPath
    if neighbors = [] then
        LongestList currentPath longestPath
    else
        neighbors
        |> List.map (fun neighbor -> GetLongestPathFromAllNeighbors neighborMap (neighbor::currentPath) longestPath)
        |> List.reduce LongestList

let GetLongestPathFromPosition neighborMap i =
    GetLongestPathFromAllNeighbors neighborMap [i] []

let GetLongestPath (neighborMap: Map<int, int list>) =
    [| 0..neighborMap.Count-1 |]
    |> Array.map (fun i -> GetLongestPathFromPosition neighborMap i)
    |> Array.reduce LongestList

let GetLongestPathParallelArray (neighborMap: Map<int, int list>) =
    [| 0..neighborMap.Count-1 |]
    |> Array.Parallel.map (fun i -> GetLongestPathFromPosition neighborMap i)
    |> Array.reduce LongestList

let GetLongestPathParallelFor (neighborMap: Map<int, int list>) =
    let inline ParallelMap (f: 'T -> 'U) (array : 'T[]) : 'U[]=
        let inputLength = array.Length
        let result = Array.zeroCreate inputLength
        Parallel.For(0, inputLength, fun i ->
            result.[i] <- f array.[i]) |> ignore
        result

    [| 0..neighborMap.Count-1 |]
    |> ParallelMap (fun i -> GetLongestPathFromPosition neighborMap i)
    |> Array.reduce LongestList

let GetLongestPathPSeq (neighborMap: Map<int, int list>) =
    [| 0..neighborMap.Count-1 |]
    |> PSeq.map (fun i -> GetLongestPathFromPosition neighborMap i)
    |> PSeq.reduce LongestList

Here is the code to build a map from an input grid:

module Gobstoppers
open System

type GobstopperCollection = { Items: string[]; Width: int; NeighborMap: Map<int, int list> }
type Gobstopper = { Position: int; Color: string; Shape: string; }

let CreateGobstopperFromString (text:string) i =
    { Position = i; Color = text.[0].ToString(); Shape = text.[1].ToString() }

let CreateGobstopper (itemArray: string[]) i =
    CreateGobstopperFromString itemArray.[i] i

let FindNeighbors (itemArray: string[]) rowWidth i =
    let onLeft = (i % rowWidth = 0)
    let onRight = (i % rowWidth = rowWidth - 1)
    let onTop = (i < rowWidth)
    let onBottom = (i >= itemArray.Length - rowWidth)

    [(if onTop || onLeft then -1 else i - rowWidth - 1);
     (if onTop then -1 else i - rowWidth);
     (if onTop || onRight then -1 else i - rowWidth + 1);
     (if onLeft then -1 else i - 1);
     (if onRight then -1 else i + 1);
     (if onBottom || onLeft then -1 else i + rowWidth - 1);
     (if onBottom then -1 else i + rowWidth);
     (if onBottom || onRight then -1 else i + rowWidth + 1)]
    |> List.filter (fun x -> x > -1)

let FindCompatibleNeighbors itemArray rowWidth i =
    let AreCompatible (a:Gobstopper) (b:string) = a.Color = b.[0].ToString() || a.Shape = b.[1].ToString()
    FindNeighbors itemArray rowWidth i
    |> List.map (fun x -> CreateGobstopper itemArray x)
    |> List.filter (fun x -> AreCompatible x itemArray.[i])
    |> List.map (fun x -> x.Position)

let Load (text:string) =
    let itemArray =
        text.Split('|')
        |> Array.map (fun x -> x.Trim())
        |> Array.filter (fun x -> x <> "")
    let rowWidth = int (sqrt (float itemArray.Length))
    let neighborMap = 
        itemArray
        |> Array.mapi (fun i x -> i, FindCompatibleNeighbors itemArray rowWidth i)
        |> Map.ofArray

    { Items = itemArray;
      Width = rowWidth;
      NeighborMap = neighborMap }

Here's the test input:

module TestData

let testGrid3 = "|yr|rr|rs|
                 |yr|gb|rp|
                 |bs|gr|yb|"

let testGrid4 = "|yr|rr|rs|gp|
                 |yr|gb|rp|pp|
                 |bs|gr|yb|bs|
                 |br|rs|yb|bb|"

let testGrid5 = "|yr|rr|rs|gp|rb|
                 |yr|gb|rp|pp|gr|
                 |bs|gr|yb|bs|bp|
                 |br|rs|yb|bb|bc|
                 |gs|yr|yr|rp|br|"

let testGrid6 = "|yr|rr|rs|gp|rb|bc|
                 |yr|gb|rp|pp|gr|pb|
                 |bs|gr|yb|bs|bp|ps|
                 |br|rs|yb|bb|bc|rs|
                 |gs|yr|yr|rp|br|rb|
                 |pp|gr|ps|pb|pr|ps|"

let testGrid7 = "|yr|rr|rs|gp|rb|bc|rb|
                 |yr|gb|rp|pp|gr|pb|rs|
                 |bs|gr|yb|bs|bp|ps|pp|
                 |br|rs|yb|bb|bc|rs|pb|
                 |gs|yr|yr|rp|br|rb|br|
                 |pp|gr|ps|pb|pr|ps|bp|
                 |gc|rb|gs|pp|bc|gb|rp|"

let testGrid8 = "|yr|rr|rs|gp|rb|bc|rb|bp|
                 |yr|gb|rp|pp|gr|pb|rs|rp|
                 |bs|gr|yb|bs|bp|ps|pp|gb|
                 |br|rs|yb|bb|bc|rs|pb|pb|
                 |gs|yr|yr|rp|br|rb|br|pr|
                 |pp|gr|ps|pb|pr|ps|bp|rs|
                 |gc|rb|gs|pp|bc|gb|rp|pp|
                 |rp|gb|rs|ys|yc|yp|rb|bb|"

Here's my console app for timing:

open System
open System.Diagnostics

let RunTimer runCount title testFunc =
    printfn title
    let RunTimedTest n = 
        let stopWatch = Stopwatch.StartNew()
        let result = testFunc()
        stopWatch.Stop()
        printfn "%i - %f" n stopWatch.Elapsed.TotalMilliseconds
        result

    let results = [| 1..runCount |] |> Array.map (fun x -> RunTimedTest x)
    printfn "%A" results.[0]

let runCount = 1
let gobs = Gobstoppers.Load TestData.testGrid6

RunTimer runCount "GetLongestPath" (fun _ -> Pathfinder.GetLongestPath gobs.NeighborMap)
RunTimer runCount "GetLongestPathParallelArray" (fun _ -> Pathfinder.GetLongestPathParallelArray gobs.NeighborMap)
RunTimer runCount "GetLongestPathParallelFor" (fun _ -> Pathfinder.GetLongestPathParallelFor gobs.NeighborMap)
RunTimer runCount "GetLongestPathPSeq" (fun _ -> Pathfinder.GetLongestPathPSeq gobs.NeighborMap)

let line = Console.ReadLine()

Upvotes: 2

Views: 897

Answers (2)

Darren
Darren

Reputation: 577

I've run into a problem before where i think the anonymous function (fun i -> run something)

don't parallelize clealy. I got zero speedup, but writing a helper function

let foo i =
    run i etc

Array.parallel.map (foo) gave me a good speedup. The other comments are relevant though too - really fine grained parallelism often doesn't pay off due to overhead starting up. You might be better having N workers and a shared queue of things to do

Upvotes: 0

Emond
Emond

Reputation: 50672

If the work that is scheduled cannot be distributed in way that really can be executed in parallel, all you add is overhead when you split the work.

If the work can really be executed in parallel across multiple cores, or waiting/idle time can be used to execute a task while waiting, that is when you might gain time.

In this case all you do is calculation so there is no waiting on IO. That is why the code will only benefit from multiple cores (if you keep the synchronisation as low as possible)

Try executing the code on more cores.

Upvotes: 2

Related Questions