Reputation: 21
I am referring to this article in order to lookup IP addresses to country: https://cloud.google.com/blog/products/data-analytics/geolocation-with-bigquery-de-identify-76-million-ip-addresses-in-20-seconds
However, the public data set: fh-bigquery.geocode.201806_geolite2_city_ipv4_locs is no longer available. So I had to import my own look up table.
I have 2 tables in Bigquery:
Row| IP_CIDR (String)| Country_Code
-------------------------
1 | 160.181.205.0/24| ME
2 | 197.157.224.0/24| SL
3 | 196.32.200.0/21 | CF
4 | 160.181.224.0/24| SI
...
has 400K rows in BQ
and
Row| IP (String)
-------------------------
1 | 160.181.205.222
2 | 197.157.224.123
3 | 196.32.200.33
4 | 160.181.224.44
...
has 10k rows in BQ
How can I match the IP address to the IP_CIDR in another table such that the result would be:
Row| IP (String) |Country_Code
-------------------------
1 | 160.181.205.222 | ME
2 | 197.157.224.123 | SL
3 | 196.32.200.33 | CF
4 | 160.181.224.44 | SI
...
and null if IP not found in IP_CIDR
This is the original query recommended by the article:
WITH source_of_ip_addresses AS (
SELECT REGEXP_REPLACE(contributor_ip, 'xxx', '0') ip, COUNT(*) c
FROM `publicdata.samples.wikipedia`
WHERE contributor_ip IS NOT null
GROUP BY 1
)
SELECT country_name, SUM(c) c
FROM (
SELECT ip, country_name, c
FROM (
SELECT *, NET.SAFE_IP_FROM_STRING(ip) & NET.IP_NET_MASK(4, mask) network_bin
FROM source_of_ip_addresses, UNNEST(GENERATE_ARRAY(9,32)) mask
WHERE BYTE_LENGTH(NET.SAFE_IP_FROM_STRING(ip)) = 4
)
JOIN `fh-bigquery.geocode.201806_geolite2_city_ipv4_locs`
USING (network_bin, mask)
)
GROUP BY 1
ORDER BY 2 DESC
Upvotes: 1
Views: 1477
Reputation: 172994
Consider below approach
create temp function SubnetToRange(CIDR string)
returns struct<start_ip string, end_ip string>
language js as '''
var beg = CIDR.substr(CIDR,CIDR.indexOf('/'));
var end = beg;
var off = (1<<(32-parseInt(CIDR.substr(CIDR.indexOf('/')+1))))-1;
var sub = beg.split('.').map(function(a){return parseInt(a)});
var buf = new ArrayBuffer(4);
var i32 = new Uint32Array(buf);
i32[0] = (sub[0]<<24) + (sub[1]<<16) + (sub[2]<<8) + (sub[3]) + off;
var end = Array.apply([],new Uint8Array(buf)).reverse().join('.');
return {start_ip: beg, end_ip: end};
''';
select IP, Country_Code
from table2
left join (
select IP_CIDR, Country_Code, SubnetToRange(IP_CIDR).*
from table1
)
on net.ipv4_to_int64(net.ip_from_string(IP))
between net.ipv4_to_int64(net.ip_from_string(start_ip))
and net.ipv4_to_int64(net.ip_from_string(end_ip))
if applied to sample data as in your question
with table1 as (
select '160.181.205.0/24' IP_CIDR, 'ME' Country_Code union all
select '197.157.224.0/24', 'SL' union all
select '196.32.200.0/21', 'CF' union all
select '160.181.224.0/24', 'SI'
), table2 as (
select '160.181.205.222' IP union all
select '197.157.224.123' union all
select '196.32.200.33' union all
select '160.181.224.44'
)
the output is
Upvotes: 2
Reputation: 17126
you can use a query like below
WITH table2 as (
SELECT 1 as Row, '160.181.205.0/24' as IP_CIDRString,'ME' as Country_Code
UNION ALL
SELECT 4 as Row, '160.181.224.0/24' as IP_CIDRString,'SI' as Country_Code
)
, Table1 as (
SELECT 1 as Row,'160.181.205.222' as IPString
UNION ALL
SELECT 2 as Row,'197.157.224.123' as IPString
UNION ALL
SELECT 3 as Row,'196.32.200.33' as IPString
UNION ALL
SELECT 4 as Row,'160.181.224.44' as IPString
),
maptable as
(
select Row, Country_Code, POW(2,(32- CAST(RIGHT(IP_CIDRString,2) AS INT64)))-1 as cidrrange, SPLIT(IP_CIDRString,'.') as IPblocks
from Table2
),
sourcetable as
(
select Row, IPString, SPLIT(IPString,'.') as IPblocks
from Table1
)
select
A.Row
,A.IPString
,B.Country_Code
from
sourcetable A
left join maptable B
on A.IPblocks[ORDINAL(1)] = B.IPblocks[ORDINAL(1)]
AND A.IPblocks[ORDINAL(2)] = B.IPblocks[ORDINAL(2)]
AND A.IPblocks[ORDINAL(3)] = B.IPblocks[ORDINAL(3)]
AND CAST(A.IPblocks[ORDINAL(4)] AS INT64) BETWEEN 0 AND cidrrange
Upvotes: 1