Mookayama
Mookayama

Reputation: 1343

google-bigquery or amazon-redshift with custom score function

We have the following code in oracle pl/sql which calculate the similarity between two string using Jaro-Winkler. What we try to do is, to find any dup based on the similarity of two string. Our user case is the user can enter the user info e.g. first_name,last_name but has typo and the only key we can use is the first_name,last_name. There is no other piece of unique identifier like SSN or email for identifying a user. So our idea is to do a self join on the first_name,last_name and then get the score of similarity and based on that we can identify the dup.

However, even with 10,000 user there will be 100,000,000 operations to do the match and we try this in oracle db and it is just too slow.

We are new to google-bigquery or Amazon Red-shift. Is there a tutorial on something like this how to implement a custom function in our data-set.

Or google-bigquery or Amazon Red Shift already have solution similar to those in oracle ??

Our current environment is not feasible to do this proof of concept so we like to do this exercise in the cloud.

Thanks for all the help.

--http://www.orafaq.com/forum/t/164224/
CREATE OR REPLACE FUNCTION GKN_COMMON.jws -- Jaro-Winkler similarity
  (p_string1     IN VARCHAR2,
   p_string2     IN VARCHAR2)
  RETURN            NUMBER
  DETERMINISTIC
AS
  v_string1         VARCHAR2 (32767);
  v_string2         VARCHAR2 (32767);
  v_closeness       NUMBER := 0;
  v_temp            VARCHAR2 (32767);
  v_comp1           VARCHAR2 (32767);
  v_comp2           VARCHAR2 (32767);
  v_matches         NUMBER := 0; 
  v_char            VARCHAR2 (1);
  v_transpositions  NUMBER := 0;
  v_d_jaro          NUMBER := 0;
  v_leading         NUMBER := 0;
  v_d_winkler       NUMBER := 0;
  v_jws             NUMBER := 0;
BEGIN
  -- check for null strings:
  IF p_string1 IS NULL OR p_string2 IS NULL THEN 
    RETURN 0;
  END IF;
  -- remove accents:
  v_string1 := translate (p_string1,
            '?S?Zs?z?AAA?A??CEEEEIIII??OOO?O?UUUUY?aaa?a??ceeeeiiii???ooo?ouuuuyy??',
            'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy');
  v_string2 := translate (p_string2,
            '?S?Zs?z?AAA?A??CEEEEIIII??OOO?O?UUUUY?aaa?a??ceeeeiiii???ooo?ouuuuyy??',
            'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy');
  -- closeness:
  v_closeness := (GREATEST (LENGTH (v_string1), LENGTH (v_string2)) / 2) - 1;
  -- find matching characters and transpositions within closeness:
  v_temp := v_string2;
  FOR i IN 1 .. LENGTH (v_string1) LOOP
    IF INSTR (v_temp, SUBSTR (v_string1, i, 1)) > 0 THEN
      v_char := SUBSTR (v_string1, i, 1);
      IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN
        v_comp1 := v_comp1 || SUBSTR (v_string1, i, 1);
        v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string1, i, 1)) - 1)
               || SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string1, i, 1)) + 1);
      END IF;
    END IF;    
  END LOOP;
  v_temp := v_string1;
  FOR i IN 1 .. LENGTH (v_string2) LOOP
    IF INSTR (v_temp, SUBSTR (v_string2, i, 1)) > 0 THEN
      v_char := SUBSTR (v_string2, i, 1);
      IF ABS (INSTR (v_string2, v_char) - INSTR (v_string1, v_char)) <= v_closeness THEN
        v_comp2 := v_comp2 || SUBSTR (v_string2, i, 1);
        v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string2, i, 1)) - 1)
               || SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string2, i, 1)) + 1);
      END IF;
    END IF;    
  END LOOP;
  -- check for null strings:
  IF v_comp1 IS NULL OR v_comp2 IS NULL THEN 
    RETURN 0;
  END IF;
  -- count matches and transpositions within closeness:
  FOR i IN 1 .. LEAST (LENGTH (v_comp1), LENGTH (v_comp2)) LOOP
    IF SUBSTR (v_comp1, i, 1) = SUBSTR (v_comp2, i, 1) THEN
      v_matches := v_matches + 1;
    ELSE
      v_char := SUBSTR (v_comp1, i, 1);
      IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN
        v_transpositions := v_transpositions + 1;
        v_matches := v_matches + 1;
      END IF; 
    END IF;
  END LOOP;
  v_transpositions := v_transpositions / 2;
  -- check for no matches:
  IF v_matches = 0
    THEN RETURN 0;
  END IF;
  -- Jaro:
  v_d_jaro := ((v_matches / LENGTH (v_string1)) + 
               (v_matches / LENGTH (v_string2)) +
               ((v_matches - v_transpositions) / v_matches)) 
               / 3;
  -- count matching leading characters (up to 4):
  FOR i IN 1 .. LEAST (LENGTH (v_string1), LENGTH (v_string2), 4) LOOP
    IF SUBSTR (v_string1, i, 1) = SUBSTR (v_string2, i, 1) THEN
      v_leading := v_leading + 1;
    ELSE
      EXIT;
    END IF;
  END LOOP;
  -- Winkler:
  v_d_winkler := v_d_jaro + ((v_leading * .1) * (1 - v_d_jaro));
  -- Jaro-Winkler similarity rounded:
  v_jws := ROUND (v_d_winkler * 100);
  RETURN v_jws;
END jws;





 WITH
   strings AS
      (SELECT NULL          string1, NULL        string2 FROM DUAL UNION ALL
       SELECT 'test'       string1, NULL        string2 FROM DUAL UNION ALL
       SELECT NULL          string1, 'test'        string2 FROM DUAL UNION ALL
       SELECT 'CRATE'      string1, 'TRACE'        string2 FROM DUAL UNION ALL
       SELECT 'MARTHA'     string1, 'MARHTA'     string2 FROM DUAL UNION ALL
       SELECT 'DWAYNE'     string1, 'DUANE'        string2 FROM DUAL UNION ALL
       SELECT 'DIXON'      string1, 'DICKSONX'   string2 FROM DUAL UNION ALL
       SELECT 'Dunningham' string1, 'Cunningham' string2 FROM DUAL UNION ALL
       SELECT 'Abroms'     string1, 'Abrams'     string2 FROM DUAL UNION ALL
       SELECT 'Lampley'    string1, 'Campley'    string2 FROM DUAL UNION ALL
       SELECT 'Jonathon'   string1, 'Jonathan'   string2 FROM DUAL UNION ALL
       SELECT 'Jeraldine'  string1, 'Gerladine'  string2 FROM DUAL UNION ALL
       SELECT 'test'       string1, 'blank'        string2 FROM DUAL UNION ALL
       SELECT 'everybody'  string1, 'every'        string2 FROM DUAL UNION ALL
       SELECT 'a'          string1, 'aaa'        string2 FROM DUAL UNION ALL
       SELECT 'Géraldine'  string1, 'Gerladine'  string2 FROM DUAL UNION ALL
       SELECT 'Jérôme'     string1, 'Jerome'     string2 FROM DUAL UNION ALL
       SELECT 'ça'          string1, 'ca'        string2 FROM DUAL UNION ALL
       SELECT 'Üwe'          string1, 'Uwe'        string2 FROM DUAL)
 SELECT string1, string2,
         --UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
         jws (string1, string2) my_jws
 FROM   strings
 ORDER  BY my_jws DESC

Upvotes: 0

Views: 605

Answers (1)

Mikhail Berlyant
Mikhail Berlyant

Reputation: 173046

Check below example
It is for BigQuery with Standard SQL (check Enabling Standard SQL) and uses JS User-Defined Functions

  CREATE TEMPORARY FUNCTION similariry(Name1 STRING, Name2 STRING)
  RETURNS FLOAT64
  LANGUAGE js AS """
    var _extend = function(dst) {
      var sources = Array.prototype.slice.call(arguments, 1);
      for (var i=0; i<sources.length; ++i) {
        var src = sources[i];
        for (var p in src) {
          if (src.hasOwnProperty(p)) dst[p] = src[p];
        }
      }
      return dst;
    };

    var Levenshtein = {
      /**
       * Calculate levenshtein distance of the two strings.
       *
       * @param str1 String the first string.
       * @param str2 String the second string.
       * @return Integer the levenshtein distance (0 and above).
       */
      get: function(str1, str2) {
        // base cases
        if (str1 === str2) return 0;
        if (str1.length === 0) return str2.length;
        if (str2.length === 0) return str1.length;

        // two rows
        var prevRow  = new Array(str2.length + 1),
            curCol, nextCol, i, j, tmp;

        // initialise previous row
        for (i=0; i<prevRow.length; ++i) {
          prevRow[i] = i;
        }

        // calculate current row distance from previous row
        for (i=0; i<str1.length; ++i) {
          nextCol = i + 1;

          for (j=0; j<str2.length; ++j) {
            curCol = nextCol;

            // substution
            nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
            // insertion
            tmp = curCol + 1;
            if (nextCol > tmp) {
              nextCol = tmp;
            }
            // deletion
            tmp = prevRow[j + 1] + 1;
            if (nextCol > tmp) {
              nextCol = tmp;
            }

            // copy current col value into previous (in preparation for next iteration)
            prevRow[j] = curCol;
          }

          // copy last col value into previous (in preparation for next iteration)
          prevRow[j] = nextCol;
        }

        return nextCol;
      }

    };

    var the_Name1;

    try {
      the_Name1 = decodeURI(Name1).toLowerCase();
    } catch (ex) {
      the_Name1 = Name1.toLowerCase();
    }

    try {
      the_Name2 = decodeURI(Name2).toLowerCase();
    } catch (ex) {
      the_Name2 = Name2.toLowerCase();
    }

    return 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length;

  """;

  WITH strings AS (
  SELECT NULL          string1, NULL        string2 UNION ALL
    SELECT 'test'       string1, NULL        string2 UNION ALL
    SELECT NULL          string1, 'test'        string2 UNION ALL
    SELECT 'CRATE'      string1, 'TRACE'        string2 UNION ALL
    SELECT 'MARTHA'     string1, 'MARHTA'     string2 UNION ALL
    SELECT 'DWAYNE'     string1, 'DUANE'        string2 UNION ALL
    SELECT 'DIXON'      string1, 'DICKSONX'   string2 UNION ALL
    SELECT 'Dunningham' string1, 'Cunningham' string2 UNION ALL
    SELECT 'Abroms'     string1, 'Abrams'     string2 UNION ALL
    SELECT 'Lampley'    string1, 'Campley'    string2 UNION ALL
    SELECT 'Jonathon'   string1, 'Jonathan'   string2 UNION ALL
    SELECT 'Jeraldine'  string1, 'Gerladine'  string2 UNION ALL
    SELECT 'test'       string1, 'blank'        string2 UNION ALL
    SELECT 'everybody'  string1, 'every'        string2 UNION ALL
    SELECT 'a'          string1, 'aaa'        string2 UNION ALL
    SELECT 'Géraldine'  string1, 'Gerladine'  string2 UNION ALL
    SELECT 'Jérôme'     string1, 'Jerome'     string2 UNION ALL
    SELECT 'ça'          string1, 'ca'        string2 UNION ALL
    SELECT 'Üwe'          string1, 'Uwe'        string2 
  )
  SELECT string1, string2, similariry(string1, string2) my_sim
  FROM   strings
  ORDER  BY my_sim DESC

Upvotes: 5

Related Questions