Jay Frias
Jay Frias

Reputation: 53

Is there a way to make this Wordle-like SQL query using advanced BigQuery features?

I'm trying to do some word/letter analysis using BigQuery using rules similar to Wordle.

You know the drill:

I have a working query but I'm not really satisfied for three reasons:

  1. It doesn't look too BigQuery-y
  2. For large sets of words, the query doesn't scale well
  3. Extending to, say, 7 letter words would look really, REALLY, REALLY horrible, messy and any number of nasty adjectives could easily describe it

The way I'm doing this vaguely reminds me of recursive CTE, but sadly those are totally elusive to me... 😅

Making it faster is not really a concern but I'm wondering if there's a more advanced/elegant way (for lack of a better term) to accomplish the same result: recursive CTE, TVFs I guess using Javascript UDFs is an option, but I'd like to stay away from them and instead explore an SQL-only alternative.

This is the ugly query:

with words as (
    select normalized_word as word, 'solution' word_type
    from unnest(["SCAPE","PETER","SMACK","MAMMA"]) as normalized_word
),
split_word as (
    select word,letter,word_type,
    row_number() over(partition by word) as pos
    from words, unnest(split(word,'')) as letter
),
check_correct as (
    select a.word word1,any_value(a.word_type) as word_type,b.word word2,
     string_agg(if(a.letter=b.letter,'🟩',a.letter),'' order by a.pos) w1,
     string_agg(if(a.letter=b.letter,'❌',b.letter),'' order by a.pos) w2,
     sum(if(a.letter=b.letter,1,0)) c,0 p
    from split_word as a, split_word as b
    where a.word <> b.word
    and b.word_type = 'solution'
    and a.pos = b.pos
    group by a.word,b.word
),
check_first as (
    select word1,word_type,word2,c,
    if(instr(w2,substring(w1,1,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,1,1))>0,concat('🟨',right(w1,4)),w1) w1,
    if(instr(w2,substring(w1,1,1))>0,
      concat(
        left(w2,instr(w2,substring(w1,1,1))-1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,1,1)))),
      w2) w2,
    from check_correct
),
check_second as (
    select word1,word_type,word2,c,
    if(instr(w2,substring(w1,2,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,2,1))>0,concat(left(w1,1),'🟨',right(w1,3)),w1) w1,
    if(instr(w2,substring(w1,2,1))>0,
      concat(
        left(w2,instr(w2,substring(w1,2,1))-1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,2,1)))),
      w2) w2,
    from check_first
),
check_third as (
    select word1,word_type,word2,c,
    if(instr(w2,substring(w1,3,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,3,1))>0,concat(left(w1,2),'🟨',right(w1,2)),w1) w1,
    if(instr(w2,substring(w1,3,1))>0,
      concat(
        left(w2,instr(w2,substring(w1,3,1))-1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,3,1)))),
      w2) w2,
    from check_second
),
check_fourth as (
    select word1,word_type,word2,c,
    if(instr(w2,substring(w1,4,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,4,1))>0,concat(left(w1,3),'🟨',right(w1,1)),w1) w1, 
    if(instr(w2,substring(w1,4,1))>0,
      concat(
        left(w2,instr(w2,substring(w1,4,1))-1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,4,1)))),
      w2) w2,
    from check_third
),
check_fifth as (
    select word1,word_type,word2,c,
    if(instr(w2,substring(w1,5,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,5,1))>0,concat(left(w1,4),'🟨'),w1) w1,
    if(instr(w2,substring(w1,5,1))>0, 
      concat(
        left(w2,instr(w2,substring(w1,5,1)) - 1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,5,1)))),
      w2) w2, -- for completeness
    from check_fourth
),
final_result as (
    select word1 as guess, word_type as guess_type, word2 as solution, c as correct, p as present, 
    length(w1)-c-p as absent,
    regexp_replace(w1, r'([A-Z])', '⬜') as wordle, w1, w2 
    from check_fifth
) 

select *
from final_result
order by guess

The result looks like this: All 12 rows

Note some interesting edge cases with repeated letters in the guess word: 2, 3, 5. My ugly query manages this by "consuming" matched letters in solution and replacing them with ❌ to avoid further matches. That's the sole purpose of w2: avoid more than one match.

Update

Mikhail's solution almost works, but fails when guess word has repeated letters: Almost works!

Upvotes: 0

Views: 262

Answers (1)

Mikhail Berlyant
Mikhail Berlyant

Reputation: 173106

Consider below approach - will work for any number of letters

with words as (
    select normalized_word as word, 'solution' word_type
    from unnest(["SHAPE","PETER","TAPES","JUMBO","NINJA","MAMMA"]) as normalized_word
), pairs as (
  select t1.word as guess, t2.word as solution
  from words t1, words t2
), greens as (
  select guess, solution, x, offset, color
  from pairs t, unnest(array(
      select as struct x, offset, if(x=y, '🟩', '⬜') color
      from unnest(split(guess, '')) x with offset 
      left join unnest(split(solution, '')) y with offset 
      using(offset)
    ))
  where guess != solution
), yellows_temp as (
  select guess, solution, x, color
  from pairs t, unnest(array(
      select as struct x, '🟨' color
      from unnest(split(guess, '')) x with offset as pos1
      join unnest(split(solution, '')) y with offset as pos2
      on x = y and pos1 != pos2
      group by x, color
    ))
  where guess != solution
), yellows as (
  select guess, solution, x, '🟨' color 
  from yellows_temp y
  left join (
    select guess, solution, x
    from greens
    where color = '🟩'
  ) g
  using (guess, solution, x)
  where g.x is null
)
select guess, solution, 
  countif(g.color = '🟩') correct, 
  count(distinct if(g.color = '⬜' and y.color = '🟨', y.x, null)) present,
  string_agg(if(g.color = '⬜' and y.color = '🟨', '🟨', g.color), '' order by offset) as wordle,
  string_agg(if(g.color = '⬜' and y.color = '🟨', '🟨', if(g.color = '🟩', '🟩', g.x)), '' order by offset) as w1
from greens g
left join yellows y 
using(guess, solution, x)
group by guess, solution
-- order by guess, solution    

with output

enter image description here

Leaving w2 for you - should be relatively simple using above as a starting point :o)

Upvotes: 2

Related Questions