Reputation: 2570
I would like to create a query that does the following:
Using a regex_mapping
table, find all rows in the sample data
that REGEXP_MATCH
on x
WITH sample_data AS (
SELECT x, y
FROM (SELECT "asd rmkt asdf" AS x, true AS y UNION ALL -- should map to remekartier
SELECT "as asdf", true UNION ALL -- should map to ali sneider
SELECT "asdafsd", false) -- should map to NULL
),
regex_mapping AS (
SELECT regex, map
FROM (SELECT "as" AS regex, "ali sneider" AS map UNION ALL
SELECT "rmkt" AS regex, "remekartier" AS map )
)
SELECT sample_data.*, mapped_item
FROM sample_data
-- but here, use multiple REGEXP_MATCH with CASE WHEN looping over the regex_mappings.
-- e.g. CASE WHEN REGEXP_MATCH(x, "as") THEN "ali sneider"
WHEN REGEXP_MATCH(x, "rmkt") THEN "remakrtier" END AS mapped_item)
Upvotes: 0
Views: 408
Reputation: 156
Try this -
WITH sample_data AS (
SELECT x, y
FROM (SELECT "asd rmkt asdf" AS x, true AS y UNION ALL -- should map to remekartier
SELECT "as asdf", true UNION ALL -- should map to ali sneider
SELECT "asdafsd", false)
),
regex_mapping AS (
SELECT regex, map
FROM (SELECT "as" AS regex, "ali sneider" AS map UNION ALL
SELECT "rmkt" AS regex, "remekartier" AS map )
)
SELECT s.*, r.map
FROM sample_data s, regex_mapping r
WHERE regexp_contains(s.x,concat('\\b',r.regex,'\\b'))
The results ->
Second way: Instead of cross-join, use a scalar subquery. I have used limit so that the subquery doesn't return more than 1 row and if multiple regexp matches, then it will return only one of them
--- same WITH clause as above query ---
SELECT s.*, (SELECT r.map
FROM regex_mapping r
WHERE regexp_contains(s.x,concat('\\b',r.regex,'\\b'))
LIMIT 1) as map
FROM sample_data s
The results ->
Third way: Deduplicated Data
WITH sample_data AS (
SELECT campaign_name, placement_name
FROM (SELECT "as_rmkt_asdf" AS campaign_name, "xdd" AS placement_name UNION ALL -- should map to remekartier
SELECT "as_asdf", "sdfsdf" UNION ALL -- should map to ali sneider
SELECT "as_rmkt_dafsd", "sdfg" UNION ALL -- should map to rmkt
SELECT "asf_adsdf", "gdf" -- should map to NULL (because higher priority)
)
),
regex_mapping AS (
SELECT regex, map, priority
FROM (SELECT "rmkt" AS regex, "remekartier" AS map, 1 AS priority UNION ALL
SELECT "as" AS regex, "ali sneider" AS map, 2 AS priority)
),
X AS (
SELECT s.*,
CASE WHEN regexp_contains(s.campaign_name, concat('(^|_)',r.regex,'($|_)')) THEN r.map ELSE NULL END AS map,
ROW_NUMBER() OVER (PARTITION BY s.campaign_name ORDER BY regexp_contains(s.campaign_name, concat('(^|_)',r.regex,'($|_)')) DESC, r.priority) AS rn
FROM sample_data s
CROSS JOIN regex_mapping r
)
SELECT * EXCEPT (rn)
FROM X
WHERE rn = 1
Upvotes: 1