Reputation: 2130
I am trying to perform what, you would think, is a trivial operation in BigQuery; I am trying to update a nested field in a BigQuery table that is the result of a 360 export.
Here is my query:
#standardSQL
UPDATE `dataset_name`.`ga_sessions_20170705`
SET hits.eventInfo.eventLabel = 'some string'
WHERE TRUE
But I get this error message:
Error: Cannot access field eventInfo on a value with type ARRAY<STRUCT<item STRUCT<transactionId INT64, currencyCode STRING>, isEntrance BOOL, minute INT64, ...>> at [3:10]
How can I update this nested field?
Upvotes: 7
Views: 12128
Reputation: 1
If you want to mask PII, you have to take into account that it can be located elsewhere than in the pagePath (for example in pagePathLevel). Beware that the structure stored in BigQuery does not respect the documentation.
Here is an example to mask email, lastname and firstname.
update
`<project-id>.<dataset-name>.<table-name>`
set
hits = (
SELECT
ARRAY(
SELECT
AS STRUCT * REPLACE (
-- correcting pages here
IF(
(
TRUE = REGEXP_CONTAINS(
page.pagePath, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)"
)
OR TRUE = REGEXP_CONTAINS(
page.pagePath, r "lastname=([a-zA-Z0-9_\-\.]+)&"
)
OR TRUE = REGEXP_CONTAINS(
page.pagePath, r "firstname=([a-zA-Z0-9_\-\.]+)&"
)
),
STRUCT(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
page.pagePath, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)",
"[REDACTED]"
),
r "lastname=([a-zA-Z0-9_\-\.]+)&",
"lastname=[REDACTED]&"
),
r "firstname=([a-zA-Z0-9_\-\.]+)&",
"firstname=[REDACTED]&"
),
page.hostname,
page.pageTitle,
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
page.searchKeyword, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)",
"[REDACTED]"
),
r "lastname=([a-zA-Z0-9_\-\.]+)&",
"lastname=[REDACTED]&"
),
r "firstname=([a-zA-Z0-9_\-\.]+)&",
"firstname=[REDACTED]&"
),
page.searchCategory,
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
page.pagePathLevel1, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)",
"[REDACTED]"
),
r "lastname=([a-zA-Z0-9_\-\.]+)&",
"lastname=[REDACTED]&"
),
r "firstname=([a-zA-Z0-9_\-\.]+)&",
"firstname=[REDACTED]&"
),
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
page.pagePathLevel2, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)",
"[REDACTED]"
),
r "lastname=([a-zA-Z0-9_\-\.]+)&",
"lastname=[REDACTED]&"
),
r "firstname=([a-zA-Z0-9_\-\.]+)&",
"firstname=[REDACTED]&"
),
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
page.pagePathLevel3, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)",
"[REDACTED]"
),
r "lastname=([a-zA-Z0-9_\-\.]+)&",
"lastname=[REDACTED]&"
),
r "firstname=([a-zA-Z0-9_\-\.]+)&",
"firstname=[REDACTED]&"
),
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
page.pagePathLevel4, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)",
"[REDACTED]"
),
r "lastname=([a-zA-Z0-9_\-\.]+)&",
"lastname=[REDACTED]&"
),
r "firstname=([a-zA-Z0-9_\-\.]+)&",
"firstname=[REDACTED]&"
)
),
page
) AS page
)
from
UNNEST(hits)
)
)
WHERE
(
-- only relevant sessions
EXISTS(
SELECT
*
FROM
UNNEST(hits) AS hits
WHERE
(
TRUE = REGEXP_CONTAINS(
hits.page.pagePath, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)"
)
OR TRUE = REGEXP_CONTAINS(
hits.page.pagePath, r "lastname=([a-zA-Z0-9_\-\.]+)&"
)
OR TRUE = REGEXP_CONTAINS(
hits.page.pagePath, r "firstname=([a-zA-Z0-9_\-\.]+)&"
)
)
)
);
Upvotes: 0
Reputation: 2759
Here is to mask PII data in ga sessions having an email
UPDATE
`<project-id>.<dataset-name>.<table-name>`
SET hits =
ARRAY(SELECT AS STRUCT * REPLACE (
-- correcting pages here
IF(REGEXP_CONTAINS(page.pagePath, r"@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
,STRUCT(
REGEXP_REPLACE(page.pagePath, r"@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", "[EMAIL]")
,page.pagePathLevel1
,page.pagePathLevel2
,page.pagePathLevel3
,page.pagePathLevel4
,page.hostname
,page.pageTitle
,page.searchKeyword
,page.searchCategory
), page) AS page)
FROM UNNEST(hits)
)
WHERE ( -- only relevant sessions
SELECT COUNT(1) > 0
FROM UNNEST(hits) AS hits
WHERE totals.visits = 1
AND hits.type = 'PAGE'
AND REGEXP_CONTAINS(hits.page.pagePath, r"@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+") = true
)
Upvotes: 0
Reputation: 61
If you need to modify a given custom dimension you can use this:
#standardSQL
UPDATE `tablename`
SET hits =
ARRAY(
SELECT AS STRUCT * REPLACE(
ARRAY(
SELECT AS STRUCT cd.index,
CASE WHEN cd.index = index_number THEN 'new value'
ELSE cd.value
END
FROM UNNEST(customDimensions) AS cd
) AS customDimensions)
FROM UNNEST(hits) hit
)
WHERE TRUE
But it takes a while to run.
Upvotes: 6
Reputation: 33705
hits
is an array, so you need to use an array subquery to assign to it. It would look something like this:
#standardSQL
UPDATE `dataset_name`.`ga_sessions_20170705`
SET hits =
ARRAY(
SELECT AS STRUCT * REPLACE(
(SELECT AS STRUCT eventInfo.* REPLACE('some string' AS eventLabel)) AS eventInfo)
FROM UNNEST(hits)
)
WHERE TRUE;
Upvotes: 9