Charles Okwuagwu
Charles Okwuagwu

Reputation: 10866

how to replace the nth character in a string with another [ELIXIR/ ERLANG]

How do you perform a String replace_at in Elixir or Erlang?

For example given this fixed width file:

EmployeeFundMappingID EmployeeID  FundID      IsActive EntryDate               ExitDate                ExitTypeID  DateCreated             CreatedByID DateModified            ModifiedByID ConfirmedBy DateConfirmed           GUID                                     IsPooled DatePooled
1                     1118544     1           1        2009-04-20 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2009-04-20 17:48:12.067 NULL                                     NULL     NULL
2                     1027350     1           1        2008-03-03 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2008-05-04 15:13:30.303 NULL                                     NULL     NULL
3                     1024795     1           1        2008-02-29 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2008-05-04 15:13:30.303 NULL                                     NULL     NULL
4                     1116497     1           1        2009-03-24 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2009-03-24 13:00:15.277 NULL                                     NULL     NULL
5                     1116569     1           1        2009-03-24 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2009-03-24 14:43:08.280 NULL                                     NULL     NULL
6                     1116920     1           1        2009-03-27 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2009-03-27 17:16:35.073 NULL                                     NULL     NULL

with col positions at:

[0, 22, 34, 46, 55, 79, 103, 115, 139, 151, 175, 188, 200, 224, 265, 274]

How do we replace \s with \t at each col positions?

I'm effectively trying to convert a Fixed-Width file into a csv

Upvotes: 1

Views: 1072

Answers (3)

Charles Okwuagwu
Charles Okwuagwu

Reputation: 10866

Comparing two implementations on a dataset with over 16M rows:

  def flat2csv1(src, dst) do
    Logger.info("START")

    t = System.system_time(:millisecond)

    funs =
      [12, 52, 76]
      |> Enum.map(&(&1 - 1))
      |> Enum.map(fn len ->
        fn <<s::binary-size(len), " ", rest::binary>> ->
          s <> "\t" <> rest
        end
      end)

    File.stream!(src)
    |> Enum.map(fn line ->
      Enum.reduce(funs, line, fn fun, acc -> fun.(acc) end)
    end)
    |> write(dst)

    log_elapsed("DONE", t)
  end

  def flat2csv0(src, dst) do
    Logger.info("START")

    t = System.system_time(:millisecond)

    File.stream!(src)
    |> Enum.map(fn <<
                     c1::binary-size(11),
                     " ",
                     c2::binary-size(39),
                     " ",
                     c3::binary-size(23),
                     " ",
                     ce::binary
                   >> ->
      c1 <> "\t" <> c2 <> "\t" <> c3 <> "\t" <> ce
    end)
    |> write(dst)

    log_elapsed("DONE", t)
  end

  defp log_elapsed(s, t) do
    t = System.system_time(:millisecond) - t
    Logger.debug("#{s}: #{t} ms")
  end

  defp write(s, dst) do
    File.write!(dst, s, [:append])
  end

Results

# flat2csv0
11:40:25.055 [info] START
11:42:26.028 [info] DONE: 120969 ms

# flat2csv1
11:45:17.521 [info] START
11:48:25.433 [info] DONE: 187906 ms

Upvotes: 0

Aleksei Matiushkin
Aleksei Matiushkin

Reputation: 121000

I would go with reducing the original lines with a set of functions changing respective positions in the string.

funs =
  [22, 34, 46, 55, 79, 103, 115, 139, 151, 175, 188, 200, 224, 265, 274]
  |> Enum.map(& &1 - 1)
  |> Enum.map(fn len ->
       fn <<s :: binary-size(len), " ", rest :: binary>> ->
         s <> "\t" <> rest
       end
     end)

input
|> String.trim
|> String.split("\n")
|> Enum.map(fn line ->
     Enum.reduce(funs, line, fn fun, acc -> fun.(acc) end)
   end)

That might be done in a more elegant way using generated macros, once per a position, and recursive calls, but reducing on function list looks more straightforward to me here.


The advantage of this approach would be that it immediately fails on any inconsistent data, assuring (more or less) that if it passed, the conversion was done properly, unlike all others shorter solutions.

Also it’s drastically faster than any Regex solution.


Since this is to be applied to 16M rows, here is the probably most performant version, that matches the whole row at once:

input
|> String.trim
|> String.split("\n")
|> Enum.map(
     # [22, 34, 46, 55, 79, 103,
     #  115, 139, 151, 175, 188,
     #  200, 224, 265, 274]
     # note: this assumes the listed positions above are 1-based
     fn <<
        c1 :: binary-size(21),
        " ",
        c2 :: binary-size(11),
        " ",
        c3 :: binary-size(11),
        " ",
        c4 :: binary-size(8),
        " ",
        c5 :: binary-size(23),
        " ",
        c6 :: binary-size(23),
        " ",
        c7 :: binary-size(11),
        " ",
        c8 :: binary-size(23),
        " ",
        c9 :: binary-size(11),
        " ",
        c10 :: binary-size(23),
        " ",
        c11 :: binary-size(12),
        " ",
        c12 :: binary-size(11),
        " ",
        c13 :: binary-size(23),
        " ",
        c14 :: binary-size(40),
        " ",
        c15 :: binary-size(8),
        " ",
        c16 :: binary
        >> ->
     c1 <> "\t" <> 
       c2 <> "\t" <> 
       c3 <> "\t" <> 
       c4 <> "\t" <> 
       c5 <> "\t" <> 
       c6 <> "\t" <> 
       c7 <> "\t" <> 
       c8 <> "\t" <> 
       c9 <> "\t" <> 
       c10 <> "\t" <> 
       c11 <> "\t" <> 
       c12 <> "\t" <> 
       c13 <> "\t" <> 
       c14 <> "\t" <> 
       c15 <> "\t" <> 
       c16
   end)

Upvotes: 3

Abhyudit Jain
Abhyudit Jain

Reputation: 3748

What you can do is join the date-time first and then replace all the spaces with commas and then revert the date-time to original format:

data
|> String.replace(~r/(-\d+)([\s]{1})(\d+)/, "\\1T\\3")
|> String.replace(~r/ +/, ",")
|> String.replace(~r/(\d)(T)(\d)/, "\\1 \\3")

Upvotes: 1

Related Questions