Reputation: 53
I'm trying to do some word/letter analysis using BigQuery using rules similar to Wordle.
You know the drill:
I have a working query but I'm not really satisfied for three reasons:
The way I'm doing this vaguely reminds me of recursive CTE, but sadly those are totally elusive to me... 😅
Making it faster is not really a concern but I'm wondering if there's a more advanced/elegant way (for lack of a better term) to accomplish the same result: recursive CTE, TVFs I guess using Javascript UDFs is an option, but I'd like to stay away from them and instead explore an SQL-only alternative.
This is the ugly query:
with words as (
select normalized_word as word, 'solution' word_type
from unnest(["SCAPE","PETER","SMACK","MAMMA"]) as normalized_word
),
split_word as (
select word,letter,word_type,
row_number() over(partition by word) as pos
from words, unnest(split(word,'')) as letter
),
check_correct as (
select a.word word1,any_value(a.word_type) as word_type,b.word word2,
string_agg(if(a.letter=b.letter,'🟩',a.letter),'' order by a.pos) w1,
string_agg(if(a.letter=b.letter,'❌',b.letter),'' order by a.pos) w2,
sum(if(a.letter=b.letter,1,0)) c,0 p
from split_word as a, split_word as b
where a.word <> b.word
and b.word_type = 'solution'
and a.pos = b.pos
group by a.word,b.word
),
check_first as (
select word1,word_type,word2,c,
if(instr(w2,substring(w1,1,1))>0,p+1,p) p,
if(instr(w2,substring(w1,1,1))>0,concat('🟨',right(w1,4)),w1) w1,
if(instr(w2,substring(w1,1,1))>0,
concat(
left(w2,instr(w2,substring(w1,1,1))-1),
'❌',
right(w2,length(w2)-instr(w2,substring(w1,1,1)))),
w2) w2,
from check_correct
),
check_second as (
select word1,word_type,word2,c,
if(instr(w2,substring(w1,2,1))>0,p+1,p) p,
if(instr(w2,substring(w1,2,1))>0,concat(left(w1,1),'🟨',right(w1,3)),w1) w1,
if(instr(w2,substring(w1,2,1))>0,
concat(
left(w2,instr(w2,substring(w1,2,1))-1),
'❌',
right(w2,length(w2)-instr(w2,substring(w1,2,1)))),
w2) w2,
from check_first
),
check_third as (
select word1,word_type,word2,c,
if(instr(w2,substring(w1,3,1))>0,p+1,p) p,
if(instr(w2,substring(w1,3,1))>0,concat(left(w1,2),'🟨',right(w1,2)),w1) w1,
if(instr(w2,substring(w1,3,1))>0,
concat(
left(w2,instr(w2,substring(w1,3,1))-1),
'❌',
right(w2,length(w2)-instr(w2,substring(w1,3,1)))),
w2) w2,
from check_second
),
check_fourth as (
select word1,word_type,word2,c,
if(instr(w2,substring(w1,4,1))>0,p+1,p) p,
if(instr(w2,substring(w1,4,1))>0,concat(left(w1,3),'🟨',right(w1,1)),w1) w1,
if(instr(w2,substring(w1,4,1))>0,
concat(
left(w2,instr(w2,substring(w1,4,1))-1),
'❌',
right(w2,length(w2)-instr(w2,substring(w1,4,1)))),
w2) w2,
from check_third
),
check_fifth as (
select word1,word_type,word2,c,
if(instr(w2,substring(w1,5,1))>0,p+1,p) p,
if(instr(w2,substring(w1,5,1))>0,concat(left(w1,4),'🟨'),w1) w1,
if(instr(w2,substring(w1,5,1))>0,
concat(
left(w2,instr(w2,substring(w1,5,1)) - 1),
'❌',
right(w2,length(w2)-instr(w2,substring(w1,5,1)))),
w2) w2, -- for completeness
from check_fourth
),
final_result as (
select word1 as guess, word_type as guess_type, word2 as solution, c as correct, p as present,
length(w1)-c-p as absent,
regexp_replace(w1, r'([A-Z])', '⬜') as wordle, w1, w2
from check_fifth
)
select *
from final_result
order by guess
Note some interesting edge cases with repeated letters in the guess word: 2, 3, 5. My ugly query manages this by "consuming" matched letters in solution and replacing them with ❌ to avoid further matches. That's the sole purpose of w2
: avoid more than one match.
Update
Mikhail's solution almost works, but fails when guess word has repeated letters:
Upvotes: 0
Views: 262
Reputation: 173106
Consider below approach - will work for any number of letters
with words as (
select normalized_word as word, 'solution' word_type
from unnest(["SHAPE","PETER","TAPES","JUMBO","NINJA","MAMMA"]) as normalized_word
), pairs as (
select t1.word as guess, t2.word as solution
from words t1, words t2
), greens as (
select guess, solution, x, offset, color
from pairs t, unnest(array(
select as struct x, offset, if(x=y, '🟩', '⬜') color
from unnest(split(guess, '')) x with offset
left join unnest(split(solution, '')) y with offset
using(offset)
))
where guess != solution
), yellows_temp as (
select guess, solution, x, color
from pairs t, unnest(array(
select as struct x, '🟨' color
from unnest(split(guess, '')) x with offset as pos1
join unnest(split(solution, '')) y with offset as pos2
on x = y and pos1 != pos2
group by x, color
))
where guess != solution
), yellows as (
select guess, solution, x, '🟨' color
from yellows_temp y
left join (
select guess, solution, x
from greens
where color = '🟩'
) g
using (guess, solution, x)
where g.x is null
)
select guess, solution,
countif(g.color = '🟩') correct,
count(distinct if(g.color = '⬜' and y.color = '🟨', y.x, null)) present,
string_agg(if(g.color = '⬜' and y.color = '🟨', '🟨', g.color), '' order by offset) as wordle,
string_agg(if(g.color = '⬜' and y.color = '🟨', '🟨', if(g.color = '🟩', '🟩', g.x)), '' order by offset) as w1
from greens g
left join yellows y
using(guess, solution, x)
group by guess, solution
-- order by guess, solution
with output
Leaving w2 for you - should be relatively simple using above as a starting point :o)
Upvotes: 2