Reputation: 173
I am learning parallel programming in Haskell using Simon Marlow's book. In the chapter about parallelizing Sudoku solvers, I decided to write my own solver using backtracking algorithm. The problem is that there is almost no performance gain when I try to distribute 6 cases among 6 cores. When I try to do examples with more cases, I get more significant performance gains yet still far from theoretical maximum which should be between 5 and 6. I understand that some cases may run far slower, but the threadscope diagram shows no excuse for such little gain. Can someone explain me what I am doing wrong. Maybe there is something about ST threads which I am not understanding?
Here is the code:
Sudoku.hs
{-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
module Sudoku (getSudokus, solve) where
import Data.Vector(Vector, (!), generate, thaw, freeze)
import Data.List ( nub )
import qualified Data.Vector.Mutable as MV
import Text.Trifecta
import Control.Monad ( replicateM, when )
import Control.Applicative ((<|>))
import Control.Monad.ST
import Control.DeepSeq (NFData)
import GHC.Generics (Generic)
data Cell = Given Int
| Filled Int
| Empty
deriving (Generic, NFData)
newtype Sudoku = Sudoku (Vector Cell)
deriving (Generic, NFData)
instance Show Cell where
show Empty = " "
show (Filled x) = " " ++ show x ++ " "
show (Given x) = "[" ++ show x ++ "]"
instance Show Sudoku where
show (Sudoku vc) = "\n" ++
"+ - - - + - - - + - - - +" ++ "\n" ++
"|" ++ i 0 ++ i 1 ++ i 2 ++ "|" ++ i 3 ++ i 4 ++ i 5 ++ "|" ++ i 6 ++ i 7 ++ i 8 ++ "|" ++ "\n" ++
"|" ++ i 9 ++ i 10 ++ i 11 ++ "|" ++ i 12 ++ i 13 ++ i 14 ++ "|" ++ i 15 ++ i 16 ++ i 17 ++ "|" ++ "\n" ++
"|" ++ i 18 ++ i 19 ++ i 20 ++ "|" ++ i 21 ++ i 22 ++ i 23 ++ "|" ++ i 24 ++ i 25 ++ i 26 ++ "|" ++ "\n" ++
"+ - - - + - - - + - - - +" ++ "\n" ++
"|" ++ i 27 ++ i 28 ++ i 29 ++ "|" ++ i 30 ++ i 31 ++ i 32 ++ "|" ++ i 33 ++ i 34 ++ i 35 ++ "|" ++ "\n" ++
"|" ++ i 36 ++ i 37 ++ i 38 ++ "|" ++ i 39 ++ i 40 ++ i 41 ++ "|" ++ i 42 ++ i 43 ++ i 44 ++ "|" ++ "\n" ++
"|" ++ i 45 ++ i 46 ++ i 47 ++ "|" ++ i 48 ++ i 49 ++ i 50 ++ "|" ++ i 51 ++ i 52 ++ i 53 ++ "|" ++ "\n" ++
"+ - - - + - - - + - - - +" ++ "\n" ++
"|" ++ i 54 ++ i 55 ++ i 56 ++ "|" ++ i 57 ++ i 58 ++ i 59 ++ "|" ++ i 60 ++ i 61 ++ i 62 ++ "|" ++ "\n" ++
"|" ++ i 63 ++ i 64 ++ i 65 ++ "|" ++ i 66 ++ i 67 ++ i 68 ++ "|" ++ i 69 ++ i 70 ++ i 71 ++ "|" ++ "\n" ++
"|" ++ i 72 ++ i 73 ++ i 74 ++ "|" ++ i 75 ++ i 76 ++ i 77 ++ "|" ++ i 78 ++ i 79 ++ i 80 ++ "|" ++ "\n" ++
"+ - - - + - - - + - - - +" ++ "\n"
where i x = show (vc ! x)
parseSudoku :: Parser Sudoku
parseSudoku = do
lst <- replicateM 81 field
(newline *> return ()) <|> eof
return $ Sudoku $ generate 81 (lst !!)
where field = (char '.' >> return Empty) <|> (Given . read . return <$> digit)
getSudokus :: String -> Maybe [Sudoku]
getSudokus raw = case parseString (some parseSudoku) mempty raw of
Success ss -> Just ss
Failure _ -> Nothing
data Direction = Back | Forward
solve :: Sudoku -> Maybe Sudoku
solve sudoku@(Sudoku puzzle) = if isValid sudoku then
Just $ runST $ do
puzzle' <- thaw puzzle
go puzzle' 0 Forward
Sudoku <$> freeze puzzle'
else Nothing
where go _ 81 _ = return ()
go vector position direction = do
cell <- MV.read vector position
case (cell, direction) of
(Empty, Back) -> error "Calling back Empty cell, this should not ever occur"
(Empty, Forward) -> MV.write vector position (Filled 1) >> go vector position Forward
(Given _, Back) -> go vector (position-1) Back
(Given _, Forward) -> go vector (position+1) Forward
(Filled 10, Back) -> MV.write vector position Empty >> go vector (position-1) Back
(Filled 10, Forward) -> go vector position Back
(Filled x, Forward) -> do
let (r, c, s) = calculatePositions position
row <- getRowMV r vector
col <- getColumnMV c vector
sqr <- getSquareMV s vector
if isUnique row && isUnique col && isUnique sqr
then go vector (position+1) Forward
else MV.write vector position (Filled (x+1)) >> go vector position Forward
(Filled x, Back) -> MV.write vector position (Filled (x+1)) >> go vector position Forward
calculatePositions :: Int -> (Int, Int, Int)
calculatePositions i = let (row, col) = divMod i 9
sqr = (row `div` 3)*3 + (col `div` 3)
in (row, col, sqr)
isValid :: Sudoku -> Bool
isValid sudoku = go 0
where go 9 = True
go i = isUnique (getRow i sudoku) && isUnique (getColumn i sudoku) && isUnique (getSquare i sudoku) && go (i+1)
getRow :: Int -> Sudoku -> [Cell]
getRow l (Sudoku vector) = go 0
where go 9 = []
go c = vector ! (l*9 + c) : go (c+1)
getRowMV :: MV.PrimMonad m => Int -> MV.MVector (MV.PrimState m) Cell -> m [Cell]
getRowMV l mv = go 0
where go 9 = return []
go c = do
n <- MV.read mv (l*9 + c)
rl <- go (c+1)
return (n:rl)
getColumn :: Int -> Sudoku -> [Cell]
getColumn c (Sudoku vector) = go 0
where go 9 = []
go i = vector ! (c + i*9) : go (i+1)
getColumnMV :: MV.PrimMonad m => Int -> MV.MVector (MV.PrimState m) Cell -> m [Cell]
getColumnMV c mv = go 0
where go 9 = return []
go i = do
n <- MV.read mv (c + i*9)
rl <- go (i+1)
return (n:rl)
getSquare :: Int -> Sudoku -> [Cell]
getSquare q (Sudoku vector) = let (y, x) = quotRem q 3
start = x*3 + y*3*9
in [ vector ! start, vector ! (start + 1), vector ! (start + 2)
, vector ! (start + 9), vector ! (start + 10), vector ! (start + 11)
, vector ! (start + 18), vector ! (start + 19), vector ! (start + 20)]
getSquareMV :: MV.PrimMonad m => Int -> MV.MVector (MV.PrimState m) a -> m [a]
getSquareMV q mv = let (y, x) = quotRem q 3
start = x*3 + y*3*9
in do
a1 <- MV.read mv start
a2 <- MV.read mv (start + 1)
a3 <- MV.read mv (start + 2)
b1 <- MV.read mv (start + 9)
b2 <- MV.read mv (start + 10)
b3 <- MV.read mv (start + 11)
c1 <- MV.read mv (start + 18)
c2 <- MV.read mv (start + 19)
c3 <- MV.read mv (start + 20)
return [a1,a2,a3,b1,b2,b3,c1,c2,c3]
isUnique :: [Cell] -> Bool
isUnique xs = let sv = strip xs
in length sv == length (nub sv)
where strip (Empty:xs) = strip xs
strip ((Given x):xs) = x : strip xs
strip ((Filled x):xs) = x : strip xs
strip [] = []
Main.hs
module Main where
import Control.Parallel.Strategies
import Control.Monad
import Control.DeepSeq ( force )
import Sudoku
import System.Environment (getArgs)
main :: IO ()
main = do
filename <- head <$> getArgs
contents <- readFile filename
case getSudokus contents of
Just sudokus -> print $ runEval $ do
start <- forM sudokus (rpar . force . solve)
forM start rseq
Nothing -> putStrLn "Error during parsing"
I am compiling it with following flags:
ghc-options: -O2 -rtsopts -threaded -eventlog
Execution with following flags:
cabal exec sudoku -- sudoku17.6.txt +RTS -N1 -s -l
Gives following performance report and threadscope diagram:
950,178,477,200 bytes allocated in the heap
181,465,696 bytes copied during GC
121,832 bytes maximum residency (7 sample(s))
30,144 bytes maximum slop
7 MiB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 227776 colls, 0 par 1.454s 1.633s 0.0000s 0.0011s
Gen 1 7 colls, 0 par 0.001s 0.001s 0.0001s 0.0002s
TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1)
SPARKS: 6 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 6 fizzled)
INIT time 0.001s ( 0.001s elapsed)
MUT time 220.452s (220.037s elapsed)
GC time 1.455s ( 1.634s elapsed)
EXIT time 0.000s ( 0.008s elapsed)
Total time 221.908s (221.681s elapsed)
Alloc rate 4,310,140,685 bytes per MUT second
Productivity 99.3% of total user, 99.3% of total elapsed
Execution with parallelization:
cabal exec sudoku -- sudoku17.6.txt +RTS -N6 -s -l
950,178,549,616 bytes allocated in the heap
325,450,104 bytes copied during GC
142,704 bytes maximum residency (7 sample(s))
82,088 bytes maximum slop
32 MiB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 128677 colls, 128677 par 37.697s 30.612s 0.0002s 0.0035s
Gen 1 7 colls, 6 par 0.005s 0.004s 0.0006s 0.0012s
Parallel GC work balance: 11.66% (serial 0%, perfect 100%)
TASKS: 14 (1 bound, 13 peak workers (13 total), using -N6)
SPARKS: 6 (5 converted, 0 overflowed, 0 dud, 0 GC'd, 1 fizzled)
INIT time 0.010s ( 0.009s elapsed)
MUT time 355.227s (184.035s elapsed)
GC time 37.702s ( 30.616s elapsed)
EXIT time 0.001s ( 0.007s elapsed)
Total time 392.940s (214.667s elapsed)
Alloc rate 2,674,847,755 bytes per MUT second
Productivity 90.4% of total user, 85.7% of total elapsed
Here are the contents of sudoku17.6.txt:
.......2143.......6........2.15..........637...........68...4.....23........7....
.......241..8.............3...4..5..7.....1......3.......51.6....2....5..3...7...
.......24....1...........8.3.7...1..1..8..5.....2......2.4...6.5...7.3...........
.......23.1..4....5........1.....4.....2...8....8.3.......5.16..4....7....3......
.......21...5...3.4..6.........21...8.......75.....6.....4..8...1..7.....3.......
.......215.3......6...........1.4.6.7.....5.....2........48.3...1..7....2........
Upvotes: 5
Views: 263
Reputation: 9767
Believe it or not, but your problem potentially had nothing to do with parallelization. In the future I'd recommend you first look at the input to the function you are trying to parallelized. It turned out you always tried a single puzzle.
Edit - @Noughtmare pointed out that according to Threadscope results posted in the question there is some parallelization going on. Which is true and it makes me believe that the file posted in question doesn't exactly match the one used for creating the results. If that's the case, then you can skip to Parallelization section for the answer about: "Why parallelizing this code yields almost no performance improvement on six core machine?"
Long story short there is a bug in your parser. If you ask my true opinion, it is actually a bug in trifecta
package documentation, because it promises to fully consume the input parseString
:
Fully parse a String to a Result.
but instead it consumes the first line only and successfully returns the result. However, honestly, I've never used it before, so maybe it is the expected bahavior.
Lets take a look at your parser:
parseSudoku :: Parser Sudoku
parseSudoku = do
lst <- replicateM 81 field
(newline *> return ()) <|> eof
return $ Sudoku $ generate 81 (lst !!)
where
field = (char '.' >> return Empty) <|> (Given . read . return <$> digit)
At first glance it looks just fine, until input is closely examined. Every empty line between the lines with data also contain a newline character, but your parser expects one at most:
.......2143.......6........2.15..........637...........68...4.....23........7....
<this is also a newline>
.......241..8.............3...4..5..7.....1......3.......51.6....2....5..3...7...
So you parser should instead be:
many (newline *> return ()) <|> eof
Side note. If it was up to me this is how I would write the parser:
parseSudoku :: Parser Sudoku
parseSudoku = do
(Sudoku <$> V.replicateM 81 field) <* ((() <$ many newline) <|> eof)
where
field = (Empty <$ char '.') <|> (Given . Data.Char.digitToInt <$> digit)
When it comes to implementation of parallelization it seems to work fine, but the problem is the work load is really unbalanced. That's why there is only about x2 speed up when using 6 cores. In other words not all puzzles are created equally hard. For that reason solving 6 puzzles using 6 cores in parallel will always get the performance of the longest solution at best. Therefore to gain more from parallelization you either need more puzzles or less CPU cores ;)
EDIT: Here are some benchmarks to support my explanation above.
These are the results for solving each individual puzzle:
And these two are the sequential and parallelized solvers using one core and six cores respectfully.
As you can see solving the second puzzle with index 1
took the longest time, which on my computer took a little over a 100 seconds. This is also the time it took for the parallelized algorithm to solve all puzzles. Which makes sense, since all other 5 puzzles were solved much quicker and those cores that were freed up had no other work to do.
Also as a sanity check if you sum up the individual times it took for puzzles to be solved it will match up pretty good with the total time it took to solve all of them sequentially.
Upvotes: 4