one-t
one-t

Reputation: 333

Getting a list of lists without using List<List<string>> in F#

I have this function here:

let ProcessFile (allLines: string list) = 
    let list = new List<List<string>>()

    let rec SplitFile (input: string list) =
        if input.Length <> 0 then
            list.Add(new List<string>(input.TakeWhile(fun x -> x <> "")))
            let nextGroup = input.SkipWhile(fun x -> x <> "").SkipWhile(fun x -> x = "")
            SplitFile (Seq.toList nextGroup)

    SplitFile allLines |> ignore
    list

It is given the contents of a file as a list of strings and takes each group that is seperated by empty lines as a seperate list, giving me a list of lists.

My question is, is there a better way to do this with a yield that give me something like a string list list instead of me having to use new List< List< string>>? As this doesn't seem particularly neat to me.

Upvotes: 4

Views: 1512

Answers (4)

Cesar Mendoza
Cesar Mendoza

Reputation: 131

How about using plain old List.fold

let processFile lines =
([], lines) ||>
List.fold(fun acc l -> 
    match acc with
        | [] when l = "" -> acc        // filter empty lines at the start of the file
        | [] -> [[l]]                  // start the first group
        | []::xss when l = "" -> acc   // filter continous empty lines
        | xs::xss when l = "" ->       // found an empty line, start a new group
            let rxs = List.rev xs      // reverse the current group before starting a new one
            []::rxs::xss
        | xs::xss -> (l::xs)::xss)     // continue adding to the current group
|> List.rev

Upvotes: 0

J D
J D

Reputation: 48687

A more idiomatic solution might be:

let processFile xs =
  let rec nonEmpties n = function
    | [] as xs | ""::xs -> n, xs
    | _::xs -> nonEmpties (n+1) xs
  let rec loop xs =
    seq { match xs with
          | [] -> ()
          | ""::xs -> yield! loop xs
          | xs ->
              let n, ys = nonEmpties 0 xs
              yield Seq.take n xs
              yield! loop ys }
  loop xs

where the nested nonEmpties function counts how many non-empty elements are at the front of the given list and returns both the count and the tail list after the last non-empty element, and the loop function skips empty elements and yields sequences of non-empty elements.

Some interesting characteristics of this solution:

  • Fully tail recursive so it can handle arbitrarily long sequences of non-empty strings and sequences of sequences of non-empty strings.

  • Avoids copying by referring back to the input list.

On test input of 1,000 sequences of 1,000 strings this solution is 8x faster than yamen's and 50% faster than Tomas'.

Here is an even faster solution that begins by converting the input list into an array and then acts upon array indices:

let processFile xs =
  let xs = Array.ofSeq xs
  let rec nonEmpties i =
    if i=xs.Length || xs.[i]="" then i else
      nonEmpties (i+1)
  let rec loop i =
    seq { if i < xs.Length then
            if xs.[i] = "" then
              yield! loop (i+1)
            else
              let j = nonEmpties i
              yield Array.sub xs i (j - i)
              yield! loop j }
  loop 0

On test input of 1,000 sequences of 1,000 strings this solution is 34x faster than yamen's and 6x faster than Tomas'.

Upvotes: 5

yamen
yamen

Reputation: 15618

Personally, I like one liners:

let source = ["a"; "b"; ""; ""; "c"; ""; "d"]

source                                                                       // can be any enumerable or seq
|> Seq.scan (fun (i, _) e -> if e = "" then (i + 1, e) else (i, e)) (0, "")  // add the 'index'
|> Seq.filter (fun (_, e) -> e <> "")                                        // remove the empty entries
|> Seq.groupBy fst                                                           // group by the index
|> Seq.map (fun (_, l) -> l |> Seq.map snd |> List.ofSeq)                    // extract the list only from each group (discard the index)
|> List.ofSeq                                                                // turn back into a list                      

The biggest problem here is that Seq.groupBy will read the entire list into memory, but you're doing that anyway. There are implementations of groupBy that will only look at adjacent entries and that would be sufficient, and will let you input the file as a Seq instead (for example, using File.ReadLines rather than File.ReadAllLines).

Upvotes: 0

Tomas Petricek
Tomas Petricek

Reputation: 243041

Your code is quite readable to me, but using TakeWhile and SkipWhile recursively is fairly inefficient. Here is a simple functional recursive solution:

let ProcessFile (allLines: string list) =
  // Recursively processes 'input' and keeps the list of 'groups' collected
  // so far. We keep elements of the currently generated group in 'current'  
  let rec SplitFile input groups current = 
    match input with 
    // Current line is empty and there was some previous group
    // Add the current group to the list of groups and continue with empty current
    | ""::xs when current <> [] -> SplitFile xs ((List.rev current)::groups) []
    // Current line is empty, but there was no previous group - skip & continue
    | ""::xs -> SplitFile xs groups []
    // Current line is non-empty - add it to the current group
    | x::xs -> SplitFile xs groups (x::current)
    // We reached the end - add current group if it is not empty
    | [] when current <> [] -> List.rev ((List.rev current)::groups)
    | [] -> List.rev groups

  SplitFile allLines  [] []

ProcessFile ["a"; "b"; ""; ""; "c"; ""; "d"]

Essenitally the same code can be written using seq { ... } as follows. We still need to keep the list of current groups using an accumulator (current) but we now return groups lazily using yield and yield! as we iterate over the inputs:

let ProcessFile (allLines: string list) =  
  let rec SplitFile input current = seq {
    match input with 
    | ""::xs when current <> [] -> 
        yield List.rev current
        yield! SplitFile xs []
    | ""::xs -> 
        yield! SplitFile xs []
    | x::xs -> 
        yield! SplitFile xs (x::current)
    | [] when current <> [] -> 
        yield List.rev current
    | [] -> () }

  SplitFile allLines []

Upvotes: 2

Related Questions