Reputation: 4065
I am attempting to write a function that generates every possible pronunciation permutation of a word using a pronunciation dictionary.
# Dictionary
sounddef <- t(matrix(strsplit('c,k,c,s,ou,uh,n,n,t,t,r,r,y,ee,w,w,o,oh,o,uh,th,th,s,s,m,m',",")[[1]], nrow = 2))
# The first column is the written letter and the second is a possible pronunciation.
match_rec <- function(x, sounddef) {
if (!nzchar(x)) return("")
returner <- NULL
for (i in 1:nrow(sounddef)) {
v <- sounddef[i,]
char <- paste0("^",v[1])
if (grepl(char, x))
returner <- c(returner, paste0(v[1],'->',v[2], ",",
match_rec(gsub(char, "", x), sounddef), collapse=""))
}
returner
}
# Unfortunately this does not return the right values
match_rec("country", sounddef)
[1] "c->k,ou->uh,n->n,t->t,r->r,y->ee,c->k,o->oh,c->k,o->uh,"
[2] "c->s,ou->uh,n->n,t->t,r->r,y->ee,c->s,o->oh,c->s,o->uh,"
The values it was supposed to return are:
[1] "c->k,ou->uh,n->n,t->t,r->r,y->ee"
[2] "c->s,ou->uh,n->n,t->t,r->r,y->ee"
As there are two possible ways to pronounce c in the dictionary.
Upvotes: 0
Views: 76
Reputation: 4065
I ended up trying something different as well. It is a less efficient a solution than what you came up with, so I will only post this in case someone else wants to reference.
match_rec2 <- function(x, sounddef) {
# Reduce sound dictionary to only possibly used sounds
sr <- sounddef %>% subset(sapply(sounddef[,1], function(x) x %>% grepl(x)))
# Loop through each character then each row in the dictionary
for (i in 1:nchar(myword)) for (ii in 1:nrow(sr))
x <- unique(c(x, str_replace(x, sr[ii,1], toupper(paste0(",", sr[ii,1],'->',sr[ii,2])))))
tolower(substr(x[x==toupper(x)], 2, 100)) %>%
sapply(function(x) x %>% strsplit(',') %>% unlist) %>% t
}
[,1] [,2] [,3] [,4] [,5] [,6]
c->k,ou->uh,n->n,t->t,r->r,y->ee "c->k" "ou->uh" "n->n" "t->t" "r->r" "y->ee"
c->s,ou->uh,n->n,t->t,r->r,y->ee "c->s" "ou->uh" "n->n" "t->t" "r->r" "y->ee"
# match_rec("country", sounddef)
rbind(microbenchmark::microbenchmark(match_rec("country", sounddef)),
microbenchmark::microbenchmark(match_rec2("country", sounddef)))
#Unit: microseconds
expr min lq mean median uq max neval
match_rec("country", sounddef) 994.215 1020.542 1167.747 1043.746 1440.897 1609.574 100
match_rec2("country", sounddef) 41038.107 44909.427 52217.281 49015.023 54858.039 86680.030 100
Upvotes: 0
Reputation: 3184
I tackled this in a slightly different way, and added some support for corner cases like the same characters appearing multiple times, and the need to select between multiple matches (by taking the longest). Note that I used a few functions from stringr and purrr packages. I'm sure the function can be optimized, but might get you off to a start...
library(stringr)
library(purrr)
match_rec <- function(x, sound_dict) {
if (!nzchar(x)) return("")
# Helper variables
key_matches <- c() # This can be optimized if number of possible matches is known
char_keys <- sound_dict[,1]
unique_keys <- unique(char_keys)
while(nzchar(x)) {
# Find matches to beginning of string
matches <- str_detect(x, paste0("^", unique_keys))
if (any(matches)) {
# Take the longest match
char_match <- max(unique_keys[matches])
key_matches <- c(key_matches, char_match)
x <- str_sub(x, 1 + nchar(char_match))
} else {
x <- str_sub(x, 2)
}
}
# Return all pronunciation permutations
expand.grid(
map(key_matches, ~ paste(., sound_dict[. == char_keys, 2], sep = "->"))
)
}
Some examples of output...
sounddef <- t(matrix(strsplit('c,k,c,s,ou,uh,n,n,t,t,r,r,y,ee,w,w,o,oh,o,uh,th,th,s,s,m,m',",")[[1]], nrow = 2))
match_rec("country", sounddef)
#> Var1 Var2 Var3 Var4 Var5 Var6
#> 1 c->k ou->uh n->n t->t r->r y->ee
#> 2 c->s ou->uh n->n t->t r->r y->ee
match_rec("counro", sounddef)
#> Var1 Var2 Var3 Var4 Var5
#> 1 c->k ou->uh n->n r->r o->oh
#> 2 c->s ou->uh n->n r->r o->oh
#> 3 c->k ou->uh n->n r->r o->uh
#> 4 c->s ou->uh n->n r->r o->uh
match_rec("ccwouo", sounddef)
#> Var1 Var2 Var3 Var4 Var5
#> 1 c->k c->k w->w ou->uh o->oh
#> 2 c->s c->k w->w ou->uh o->oh
#> 3 c->k c->s w->w ou->uh o->oh
#> 4 c->s c->s w->w ou->uh o->oh
#> 5 c->k c->k w->w ou->uh o->uh
#> 6 c->s c->k w->w ou->uh o->uh
#> 7 c->k c->s w->w ou->uh o->uh
#> 8 c->s c->s w->w ou->uh o->uh
match_rec("", sounddef)
#> [1] ""
Upvotes: 1