Jose B
Jose B

Reputation: 2130

Update nested field in BigQuery table

I am trying to perform what, you would think, is a trivial operation in BigQuery; I am trying to update a nested field in a BigQuery table that is the result of a 360 export.

Here is my query:

#standardSQL
UPDATE `dataset_name`.`ga_sessions_20170705`
SET hits.eventInfo.eventLabel = 'some string'
WHERE TRUE

But I get this error message:

Error: Cannot access field eventInfo on a value with type ARRAY<STRUCT<item STRUCT<transactionId INT64, currencyCode STRING>, isEntrance BOOL, minute INT64, ...>> at [3:10]

How can I update this nested field?

Upvotes: 7

Views: 12128

Answers (4)

QDV
QDV

Reputation: 1

If you want to mask PII, you have to take into account that it can be located elsewhere than in the pagePath (for example in pagePathLevel). Beware that the structure stored in BigQuery does not respect the documentation.

Here is an example to mask email, lastname and firstname.

update 
  `<project-id>.<dataset-name>.<table-name>` 
set 
  hits = (
    SELECT 
      ARRAY(
        SELECT 
          AS STRUCT * REPLACE (
            -- correcting pages here
            IF(
              (
                TRUE = REGEXP_CONTAINS(
                  page.pagePath, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)"
                ) 
                OR TRUE = REGEXP_CONTAINS(
                  page.pagePath, r "lastname=([a-zA-Z0-9_\-\.]+)&"
                ) 
                OR TRUE = REGEXP_CONTAINS(
                  page.pagePath, r "firstname=([a-zA-Z0-9_\-\.]+)&"
                )
              ), 
              STRUCT(
                REGEXP_REPLACE(
                  REGEXP_REPLACE(
                    REGEXP_REPLACE(
                      page.pagePath, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)", 
                      "[REDACTED]"
                    ), 
                    r "lastname=([a-zA-Z0-9_\-\.]+)&", 
                    "lastname=[REDACTED]&"
                  ), 
                  r "firstname=([a-zA-Z0-9_\-\.]+)&", 
                  "firstname=[REDACTED]&"
                ), 
                page.hostname, 
                page.pageTitle, 
                REGEXP_REPLACE(
                  REGEXP_REPLACE(
                    REGEXP_REPLACE(
                      page.searchKeyword, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)", 
                      "[REDACTED]"
                    ), 
                    r "lastname=([a-zA-Z0-9_\-\.]+)&", 
                    "lastname=[REDACTED]&"
                  ), 
                  r "firstname=([a-zA-Z0-9_\-\.]+)&", 
                  "firstname=[REDACTED]&"
                ), 
                page.searchCategory, 
                REGEXP_REPLACE(
                  REGEXP_REPLACE(
                    REGEXP_REPLACE(
                      page.pagePathLevel1, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)", 
                      "[REDACTED]"
                    ), 
                    r "lastname=([a-zA-Z0-9_\-\.]+)&", 
                    "lastname=[REDACTED]&"
                  ), 
                  r "firstname=([a-zA-Z0-9_\-\.]+)&", 
                  "firstname=[REDACTED]&"
                ), 
                REGEXP_REPLACE(
                  REGEXP_REPLACE(
                    REGEXP_REPLACE(
                      page.pagePathLevel2, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)", 
                      "[REDACTED]"
                    ), 
                    r "lastname=([a-zA-Z0-9_\-\.]+)&", 
                    "lastname=[REDACTED]&"
                  ), 
                  r "firstname=([a-zA-Z0-9_\-\.]+)&", 
                  "firstname=[REDACTED]&"
                ), 
                REGEXP_REPLACE(
                  REGEXP_REPLACE(
                    REGEXP_REPLACE(
                      page.pagePathLevel3, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)", 
                      "[REDACTED]"
                    ), 
                    r "lastname=([a-zA-Z0-9_\-\.]+)&", 
                    "lastname=[REDACTED]&"
                  ), 
                  r "firstname=([a-zA-Z0-9_\-\.]+)&", 
                  "firstname=[REDACTED]&"
                ), 
                REGEXP_REPLACE(
                  REGEXP_REPLACE(
                    REGEXP_REPLACE(
                      page.pagePathLevel4, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)", 
                      "[REDACTED]"
                    ), 
                    r "lastname=([a-zA-Z0-9_\-\.]+)&", 
                    "lastname=[REDACTED]&"
                  ), 
                  r "firstname=([a-zA-Z0-9_\-\.]+)&", 
                  "firstname=[REDACTED]&"
                )
              ), 
              page
            ) AS page
          ) 
        from 
          UNNEST(hits)
      )
  ) 
WHERE 
  (
    -- only relevant sessions
    EXISTS(
      SELECT 
        * 
      FROM 
        UNNEST(hits) AS hits 
      WHERE 
        (
          TRUE = REGEXP_CONTAINS(
            hits.page.pagePath, r "([a-zA-Z0-9_\-\.]*)@([a-zA-Z0-9_\-\.]*)"
          ) 
          OR TRUE = REGEXP_CONTAINS(
            hits.page.pagePath, r "lastname=([a-zA-Z0-9_\-\.]+)&"
          ) 
          OR TRUE = REGEXP_CONTAINS(
            hits.page.pagePath, r "firstname=([a-zA-Z0-9_\-\.]+)&"
          )
        )
    )
  );

Upvotes: 0

Jai Prakash
Jai Prakash

Reputation: 2759

Here is to mask PII data in ga sessions having an email

UPDATE
     `<project-id>.<dataset-name>.<table-name>`
 
SET hits =
  ARRAY(SELECT AS STRUCT * REPLACE (
    -- correcting pages here
    IF(REGEXP_CONTAINS(page.pagePath, r"@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
    ,STRUCT(
        REGEXP_REPLACE(page.pagePath, r"@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", "[EMAIL]")
        ,page.pagePathLevel1
        ,page.pagePathLevel2
        ,page.pagePathLevel3
        ,page.pagePathLevel4
        ,page.hostname
        ,page.pageTitle
        ,page.searchKeyword
        ,page.searchCategory
    ), page) AS page)
    
    FROM UNNEST(hits)
  ) 
WHERE ( -- only relevant sessions
  SELECT COUNT(1) > 0 
  FROM UNNEST(hits) AS hits
  WHERE totals.visits = 1
    AND hits.type = 'PAGE'
    AND REGEXP_CONTAINS(hits.page.pagePath, r"@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+") = true
)

Upvotes: 0

If you need to modify a given custom dimension you can use this:

#standardSQL
UPDATE `tablename`
SET hits = 
  ARRAY(
    SELECT AS STRUCT * REPLACE(
      ARRAY(
        SELECT AS STRUCT cd.index,
          CASE WHEN cd.index = index_number THEN 'new value'
          ELSE cd.value
          END
        FROM UNNEST(customDimensions) AS cd
      ) AS customDimensions)
    FROM UNNEST(hits) hit
  )
WHERE TRUE

But it takes a while to run.

Upvotes: 6

Elliott Brossard
Elliott Brossard

Reputation: 33705

hits is an array, so you need to use an array subquery to assign to it. It would look something like this:

#standardSQL
UPDATE `dataset_name`.`ga_sessions_20170705`
SET hits =
  ARRAY(
    SELECT AS STRUCT * REPLACE(
      (SELECT AS STRUCT eventInfo.* REPLACE('some string' AS eventLabel)) AS eventInfo)
    FROM UNNEST(hits)
  )
WHERE TRUE;

Upvotes: 9

Related Questions