Reputation: 33
I have a Polars DataFrame in which each cell contains a sequence of single digits stored as a string, and I want to count the number of differences between pairs of characters, both within a string and between the strings of two columns. For example:
import polars as pl

df = pl.DataFrame({"pop_1": ["100", "0021"], "pop_2": ["11002", "0000"]})
| pop_1  | pop_2   |
|________|_________|
| "100"  | "11002" |
| "0021" | "0000"  |
pop_1 row 1 ("100") has 2 differences with itself (the 1 differs from each of the two 0s); pop_2 row 1 ("11002") has 8 differences with itself; comparing every character of pop_1 against every character of pop_2 at row 1 gives 9 differences.
The naive implementation of this would be:
def get_dxy(str1, str2):
    # count mismatches over every pair of one char from str1 and one char from str2
    diffs = 0
    for x in str1:
        for y in str2:
            if x != y:
                diffs += 1
    return diffs

def get_pi(str1):
    # count mismatches over every unordered pair of chars within str1
    diffs = 0
    for i in range(len(str1) - 1):
        for j in range(i + 1, len(str1)):
            if str1[i] != str1[j]:
                diffs += 1
    return diffs
I need to report these differences in separate columns. I am able to do this by using map_elements at each row:
df = df.with_columns(
    pl.col('pop_1')
    .map_elements(get_pi, return_dtype=pl.Int64)
    .alias('pi_1')
)
df = df.with_columns(
    pl.col('pop_2')
    .map_elements(get_pi, return_dtype=pl.Int64)
    .alias('pi_2')
)
df = df.with_columns(
    pl.struct("pop_1", "pop_2")
    .map_elements(lambda cols: get_dxy(cols["pop_1"], cols["pop_2"]), return_dtype=pl.Int64)
    .alias("dxy")
)
df
| pop_1  | pop_2   | pi_1 | pi_2 | dxy |
|________|_________|______|______|_____|
| "100"  | "11002" | 2    | 8    | 9   |
| "0021" | "0000"  | 5    | 0    | 8   |
However, my data is much larger than this, and this method is not very fast. What would be the fastest way to accomplish this using Polars' built-in tools (perhaps without using map_elements)?
Could I get some hints on how to do this?
Upvotes: 1
Views: 185
Reputation: 21580
It is possible to implement similar logic using "native" Polars operations.
If you reshape with .unpivot(), you can generate a row for each character position (except the last) of each string with:
(df.with_row_index()
.unpivot(index="index")
.with_columns(str_len = pl.col.value.str.len_chars())
.with_columns(char_id = pl.int_ranges(1, pl.col.str_len))
.explode("char_id")
)
shape: (12, 5)
┌───────┬──────────┬───────┬─────────┬─────────┐
│ index ┆ variable ┆ value ┆ str_len ┆ char_id │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str ┆ u32 ┆ i64 │
╞═══════╪══════════╪═══════╪═════════╪═════════╡
│ 0 ┆ pop_1 ┆ 100 ┆ 3 ┆ 1 │
│ 0 ┆ pop_1 ┆ 100 ┆ 3 ┆ 2 │
│ 1 ┆ pop_1 ┆ 0021 ┆ 4 ┆ 1 │
│ 1 ┆ pop_1 ┆ 0021 ┆ 4 ┆ 2 │
│ 1 ┆ pop_1 ┆ 0021 ┆ 4 ┆ 3 │
│ … ┆ … ┆ … ┆ … ┆ … │
│ 0 ┆ pop_2 ┆ 11002 ┆ 5 ┆ 3 │
│ 0 ┆ pop_2 ┆ 11002 ┆ 5 ┆ 4 │
│ 1 ┆ pop_2 ┆ 0000 ┆ 4 ┆ 1 │
│ 1 ┆ pop_2 ┆ 0000 ┆ 4 ┆ 2 │
│ 1 ┆ pop_2 ┆ 0000 ┆ 4 ┆ 3 │
└───────┴──────────┴───────┴─────────┴─────────┘
You can .str.slice() out the character at each position and the rest of the string after it.
.str.count_matches() and some arithmetic can then be used to count the number of diffs each position contributes.
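A minimal sketch of that per-position arithmetic, assuming the value "0021" at char_id = 1 (the frame name demo and the column name diffs are just for illustration): the remainder after that position is "021" (3 characters), the current character "0" appears once in it, so the position contributes (4 - 1) - 1 = 2 diffs.
import polars as pl

demo = pl.DataFrame({"value": ["0021"], "char_id": [1]})

(demo
    .with_columns(str_len = pl.col.value.str.len_chars())
    .with_columns(diffs =
        (pl.col.str_len - pl.col.char_id)
        - pl.col.value.str.slice(pl.col.char_id)
          .str.count_matches(pl.col.value.str.slice(pl.col.char_id - 1, 1))
    )
)
Applying that at every position and summing per index and variable gives the full pipeline: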
(df.with_row_index()
.unpivot(index="index")
.with_columns(str_len = pl.col.value.str.len_chars())
.with_columns(char_id = pl.int_ranges(1, pl.col.str_len))
.explode("char_id")
.with_columns(
(pl.col.str_len - pl.col.char_id)
- pl.col.value.str.slice(pl.col.char_id)
.str.count_matches(pl.col.value.str.slice(pl.col.char_id - 1, 1))
)
.group_by("index", "variable")
.agg(pl.col.str_len.sum())
.with_columns(pl.col.variable.str.replace(r"^pop", "pi"))
.pivot(on="variable", index="index")
)
shape: (2, 3)
┌───────┬──────┬──────┐
│ index ┆ pi_1 ┆ pi_2 │
│ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 │
╞═══════╪══════╪══════╡
│ 1 ┆ 5 ┆ 0 │
│ 0 ┆ 2 ┆ 8 │
└───────┴──────┴──────┘
A similar approach can be used for the dxy count.
If you use .str.split("") to turn one of the strings into a list of characters, you can then explode it:
(df.with_row_index()
.with_columns(pl.col.pop_1.str.split(""))
.explode("pop_1")
)
shape: (7, 3)
┌───────┬───────┬───────┐
│ index ┆ pop_1 ┆ pop_2 │
│ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str │
╞═══════╪═══════╪═══════╡
│ 0 ┆ 1 ┆ 11002 │
│ 0 ┆ 0 ┆ 11002 │
│ 0 ┆ 0 ┆ 11002 │
│ 1 ┆ 0 ┆ 0000 │
│ 1 ┆ 0 ┆ 0000 │
│ 1 ┆ 2 ┆ 0000 │
│ 1 ┆ 1 ┆ 0000 │
└───────┴───────┴───────┘
The number of diffs per exploded row is then just .str.len_chars() - .str.count_matches():
(df.with_row_index()
.with_columns(pl.col.pop_1.str.split(""))
.explode("pop_1")
.with_columns(dxy =
pl.col.pop_2.str.len_chars()
- pl.col.pop_2.str.count_matches(pl.col.pop_1)
)
.group_by("index")
.agg(pl.col.dxy.sum())
)
shape: (2, 2)
┌───────┬─────┐
│ index ┆ dxy │
│ --- ┆ --- │
│ u32 ┆ u32 │
╞═══════╪═════╡
│ 0 ┆ 9 │
│ 1 ┆ 8 │
└───────┴─────┘
You could combine everything into a single frame using joins, where df_pi and df_dxy hold the results of the pi and dxy pipelines above:
(df.with_row_index()
.join(df_pi.join(df_dxy, on="index"), on="index", how="left")
)
Upvotes: 1
Reputation: 1750
These should be faster, especially for long strings.
def get_pi_2(string):
    diffs = 0
    string_length = len(string)
    chars = set(string)
    for char in chars:
        reps = string.count(char)
        diffs += (string_length - reps) * reps
    return diffs // 2  # differences are counted twice
def get_dxy_2(str1, str2):
    diffs = 0
    str2_length = len(str2)
    chars = set(str1)
    for char in chars:
        reps = str1.count(char)
        diffs += reps * (str2_length - str2.count(char))
    return diffs
list(set(...))
could be used instead of just sets to get a small boost in the for loop, but because there are few elements (only numbers and letters), the difference may not justify that additional conversion.
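As a rough sketch, these could be dropped straight into the map_elements calls from the question (this still uses map_elements, it just makes the per-row Python work cheaper; get_pi_2 and get_dxy_2 are the functions above):
import polars as pl

df = pl.DataFrame({"pop_1": ["100", "0021"], "pop_2": ["11002", "0000"]})

df = df.with_columns(
    pl.col("pop_1").map_elements(get_pi_2, return_dtype=pl.Int64).alias("pi_1"),
    pl.col("pop_2").map_elements(get_pi_2, return_dtype=pl.Int64).alias("pi_2"),
    pl.struct("pop_1", "pop_2")
    .map_elements(lambda cols: get_dxy_2(cols["pop_1"], cols["pop_2"]), return_dtype=pl.Int64)
    .alias("dxy"),
)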
Upvotes: 1