Chinese Monger
Chinese Monger

Reputation: 29

Extract TLD from a domain name and group them based on TLD

I need to extract TLD from the web address and if it matches with a pre-defined set of TLDs(com,edu,nz,au), I need to sum it up. In case it doesn't match with the pre-defined TLD, It should be categorized in the "Other" category. If the webadress isn't available for a particular bussiness, it should be categorized in the "Not available".

Expected Output:

CLIENT TYPE     TOTAL
-------------  ----------
com             4
au              5
nz              0
Not Available   0
Other           0

I have written the following query but it does not give me rows which have 0 value.

select tld2, NVL(cnt,0)  from (select REGEXP_SUBSTR (webaddress, '\.([a-z]+)(/|$)', 1, 1, NULL, 1) as tld2, count(*) cnt from client group by REGEXP_SUBSTR (webaddress, '\.([a-z]+)(/|$)', 1, 1, NULL, 1))a where tld2 in ('com','edu','gov','org')
UNION ALL
select 'Not Available' as tld2, COUNT(webaddress) from client where webaddress is null
UNION
select 'Other' as tld2, NVL(cnt,0)  from (select REGEXP_SUBSTR (webaddress, '\.([a-z]+)(/|$)', 1, 1, NULL, 1) as tld2, count(*) cnt from client group by REGEXP_SUBSTR (webaddress, '\.([a-z]+)(/|$)', 1, 1, NULL, 1))a where tld2 not in ('com','edu','gov','org');

Can someone please guide me if I should use cases here?

Upvotes: 0

Views: 354

Answers (3)

MT0
MT0

Reputation: 167981

You can create a Java function to find the TLD (since your regular expression doesn't handle the case when there are port numbers, and possibly other edge cases such as https://localhost/not/at/example.com/, and using an API designed to handle URIs would be better):

CREATE AND COMPILE JAVA SOURCE NAMED URIHandler AS
import java.net.URI;
import java.net.URISyntaxException;

public class URIHandler {
  public static String getTLD( final String url )
  {
    String domain = null;
    try
    {
      URI uri = new URI( url );
      domain = uri.getHost();
    }
    catch ( URISyntaxException ex )
    {
    }
    if ( domain == null )
    {
        return null;
    }
    int index = domain.lastIndexOf( "." );
    return ( index >= 0 ? domain.substring( index + 1 ) : domain );
  }
}
/

Which you can then wrap in a PL/SQL function:

CREATE FUNCTION getTLD( url IN VARCHAR2 ) RETURN VARCHAR2
AS LANGUAGE JAVA NAME 'URIHandler.getTLD( java.lang.String ) return java.lang.String';
/

Then you can use the code:

WITH tlds ( tld ) AS (
  SELECT 'Not Available' FROM DUAL UNION ALL
  SELECT 'com'           FROM DUAL UNION ALL
  SELECT 'nz'            FROM DUAL UNION ALL
  SELECT 'au'            FROM DUAL UNION ALL
  SELECT 'Other'         FROM DUAL
),
matches ( match ) AS (
  SELECT DECODE(
           getTLD( url ),
           NULL,  'Not Available',
           'com', 'com',
           'au',  'au',
           'nz',  'nz',
                  'Other'
         )
  FROM   table_name
)
SELECT t.tld,
       COUNT( m.match )
FROM   tlds t
       LEFT OUTER JOIN matches m
       ON ( t.tld = m.match )
GROUP BY
       t.tld;

Which, for the sample data:

CREATE TABLE table_name ( url ) AS
SELECT 'http://example.com'      FROM DUAL UNION ALL
SELECT 'http://example.com:80/'  FROM DUAL UNION ALL
SELECT 'https://example.au'      FROM DUAL UNION ALL
SELECT 'https://example.au:442/' FROM DUAL UNION ALL
SELECT 'https://example.nz/not/at/example.com/' FROM DUAL UNION ALL
--SELECT 'https://example.net'     FROM DUAL UNION ALL
SELECT 'not a URI' FROM DUAL;

Outputs:

TLD           | COUNT(M.MATCH)
:------------ | -------------:
Other         |              0
com           |              2
nz            |              1
au            |              2
Not Available |              1

db<>fiddle here

Upvotes: 1

gogocho
gogocho

Reputation: 218

Please try A little edit of your approach:

select tld2, NVL(cnt,0)  from (select REGEXP_SUBSTR (webaddress, '\.([a-z]+)(/|$)', 1, 1, NULL, 1) as tld2, count(*) cnt from client group by REGEXP_SUBSTR (webaddress, '\.([a-z]+)(/|$)', 1, 1, NULL, 1))a where tld2 in ('com','edu','gov','org')
UNION ALL
select 'Not Available' as tld2, cnt from (select COUNT(webaddress) cnt from client where webaddress is null)
UNION
select 'Other' as tld2, cnt  from (select count(webaddress) cnt from client where REGEXP_SUBSTR (webaddress, '\.([a-z]+)(/|$)', 1, 1, NULL, 1) not in ('com','edu','gov','org'))a ;

Thanks.

Upvotes: 1

Elzzz
Elzzz

Reputation: 131

take a look at this example:

with sub as (
select case when webaddress is null then 'Not Available' 
             when domain_name  in ('com','edu','gov','org') then domain_name else 'Other' end client_type 
             from (
SELECT 
    regexp_substr(webaddress, '\.([a-z]+)(/|$)', 1, 1, NULL,
                  1) domain_name,
    webaddress
FROM
    (
        SELECT
            'https://stackoverflow.com/questions/65096217/' webaddress
        FROM
            dual
        UNION ALL
        SELECT
            'https://stackoverflow.edu/questions/65096217/' 
        FROM
            dual
        UNION ALL
        SELECT
            'https://stackoverflow.edu/questions/6509621/' 
        FROM
            dual
        UNION ALL
        select 'https://stackoverflow.de/questions/65096217/' 
        from dual
        /*UNION ALL
        select null 
        from dual*/
    ))),
cat as (select regexp_substr('Not Available,com,edu,gov,org,Other','[^,]+', 1, level ) val
from dual
connect by regexp_substr('Not Available,com,edu,gov,org,Other', '[^,]+', 1, level) is not null)
select c.val, sum(case when s.client_type is null then 0 else 1 end)
from sub s right outer join cat c on (c.val = s.client_type)
group by c.val;

previous (incomplete sol): very simple solution using plain group by and case stmt. could be this:

select  case when s.webaddress is null then 'Not Available' 
             when s.domain_name  in ('com','edu','gov','org') then s.domain_name else 'Other' end client_type, 
             count(*)
        from (SELECT
    regexp_substr(webaddress, '\.([a-z]+)(/|$)', 1, 1, NULL,
                  1) domain_name,
    webaddress
FROM
    (
        SELECT
            'https://stackoverflow.com/questions/65096217/' webaddress
        FROM
            dual
        UNION ALL
        SELECT
            'https://stackoverflow.edu/questions/65096217/' 
        FROM
            dual
        UNION ALL
        SELECT
            'https://stackoverflow.edu/questions/6509621/' 
        FROM
            dual
        UNION ALL
        SELECT
            NULL 
        FROM
            dual
        UNION ALL
        select 'https://stackoverflow.de/questions/65096217/' 
        from dual
    )) s
        group by case when s.webaddress is null then 'Not Available' 
             when s.domain_name  in ('com','edu','gov','org') then s.domain_name else 'Other' end
;

Upvotes: 0

Related Questions