Reputation: 682
I am currently working on a Medical Dosage data. It is a big dataset/oracle table with a string variable containing millions of records. the string variable looks like below:
Drug_Direction
(1 JAN) INJECT 2ML (100MG) IV/IM AM THEN 0.5ML (25MG) 20 MIN LATER, THEN 2.5ML (125MG) PM
(SEP 20, 2018) INJECT 0.3ML (30MG) ON S1, 0.6ML (60MG) ON S2 AND 2ML(200MG) ON S3
These are the sample records. I need to find the MG(Miligram) doses from this main string and calculate the sum. For example:
Required Output:
100+25+125=250
30+60+200=290`
Also, the string is not in a fixed format. and sometimes there are variations, such as only 2 or 1MG dose exist. In that case. I need to get only those MG doses. I understand i would may be need to count the MG occurrences, find the numbers and do a sum. I am working on it simultaneously.
Same Data is available in oracle too. So if there is an easier way to do this in Oracle-sql, that is welcome too.
Upvotes: 0
Views: 101
Reputation: 168588
Assuming it is a single text string then, in Oracle, you can use multiple recursive sub-query factoring clauses to split the string into substrings:
Oracle Setup:
CREATE TABLE table_name ( id, value ) AS
SELECT 1, '(1 JAN) INJECT 2ML (100MG) IV/IM AM THEN 0.5ML (25MG) 20 MIN LATER, THEN 2.5ML (125MG) PM'
|| '(SEP 20, 2018) INJECT 0.3ML (30MG) ON S1, 0.6ML (60MG) ON S2 AND 2ML(200MG) ON S3' FROM DUAL;
Query:
WITH datelines ( id, value, dt, pos, lvl ) AS (
SELECT id,
value,
REGEXP_SUBSTR(
value,
'\((([0-2]?\d|3[01]) (JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)|(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC) ([0-2]?\d|3[01]), \d{4})\)',
1,
1,
NULL,
1
),
REGEXP_INSTR(
value,
'\((([0-2]?\d|3[01]) (JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)|(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC) ([0-2]?\d|3[01]), \d{4})\)',
1,
1
),
1
FROM table_name
UNION ALL
SELECT id,
value,
REGEXP_SUBSTR(
value,
'\((([0-2]?\d|3[01]) (JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)|(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC) ([0-2]?\d|3[01]), \d{4})\)',
1,
LVL + 1,
NULL,
1
),
REGEXP_INSTR(
value,
'\((([0-2]?\d|3[01]) (JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)|(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC) ([0-2]?\d|3[01]), \d{4})\)',
1,
LVL + 1
),
LVL + 1
FROM datelines
WHERE REGEXP_SUBSTR(
value,
'\((([0-2]?\d|3[01]) (JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)|(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC) ([0-2]?\d|3[01]), \d{4})\)',
1,
LVL + 1,
NULL,
1
) IS NOT NULL
),
actions ( id, dt, lvl, actions ) AS (
SELECT id,
dt,
lvl,
SUBSTR(
value,
pos + LENGTH( dt ) + 2,
LEAD( pos, 1, LENGTH( value ) + 1 ) OVER ( PARTITION BY id ORDER BY lvl ) - pos - LENGTH( dt ) - 2
)
FROM datelines
),
amounts ( id, dt, lvl, actions, amount, num_amounts, amount_lvl ) AS (
SELECT id,
dt,
lvl,
actions,
TO_NUMBER( REGEXP_SUBSTR( actions, '\((\d+)MG\)', 1, 1, NULL, 1 ) ),
REGEXP_COUNT( actions, '\((\d+)MG\)' ),
1
FROM actions
UNION ALL
SELECT id,
dt,
lvl,
actions,
TO_NUMBER( REGEXP_SUBSTR( actions, '\((\d+)MG\)', 1, amount_lvl + 1, NULL, 1 ) ),
num_amounts,
amount_lvl + 1
FROM amounts
WHERE amount_lvl < num_amounts
)
SELECT id,
dt,
SUM( amount ) AS total_amount
FROM amounts
GROUP BY id, dt, lvl;
Output:
ID | DT | TOTAL_AMOUNT -: | :----------- | -----------: 1 | SEP 20, 2018 | 290 1 | 1 JAN | 250
db<>fiddle here
Update
If each line is in a different row of the database table then:
Oracle Setup:
CREATE TABLE table_name ( id, value ) AS
SELECT 1, '(1 JAN) INJECT 2ML (100MG) IV/IM AM THEN 0.5ML (25MG) 20 MIN LATER, THEN 2.5ML (125MG) PM' FROM DUAL UNION ALL
SELECT 2, '(SEP 20, 2018) INJECT 0.3ML (30MG) ON S1, 0.6ML (60MG) ON S2 AND 2ML(200MG) ON S3' FROM DUAL;
Query:
WITH amounts ( id, value, dt, amount, amount_index, num_amounts ) AS (
SELECT id,
value,
REGEXP_SUBSTR( value, '\((.*?)\)', 1, 1, NULL, 1 ),
TO_NUMBER( REGEXP_SUBSTR( value, '\((\d+)MG\)', 1, 1, NULL, 1 ) ),
1,
REGEXP_COUNT( value, '\((\d+)MG\)' )
FROM table_name
UNION ALL
SELECT id,
value,
dt,
TO_NUMBER( REGEXP_SUBSTR( value, '\((\d+)MG\)', 1, amount_index + 1, NULL, 1 ) ),
amount_index + 1,
num_amounts
FROM amounts
WHERE amount_index < num_amounts
)
SELECT id,
MAX( dt ) AS dt,
SUM( amount ) AS total_amount
FROM amounts
GROUP BY id;
Output:
ID | DT | TOTAL_AMOUNT -: | :----------- | -----------: 1 | 1 JAN | 250 2 | SEP 20, 2018 | 290
db<>fiddle here
Upvotes: 1
Reputation: 2065
This is fairly easy to do in Oracle. You could:
REGEXP_COUNT
to count the number of occurrences of MG values in each stringCONNECT BY
to create a row for each matchREGEXP_SUBSTR
to get each actual matchSomething like this:
WITH test_vals AS (
SELECT '(1 JAN) INJECT 2ML (100MG) IV/IM AM THEN 0.5ML (25MG) 20 MIN LATER, THEN 2.5ML (125MG) PM' AS drug_direction FROM dual
UNION ALL SELECT '(SEP 20, 2018) INJECT 0.3ML (30MG) ON S1, 0.6ML (60MG) ON S2 AND 2ML(200MG) ON S3' FROM dual
),
match_rows AS ( /* Get a row for each match */
SELECT DISTINCT
m.drug_direction,
LEVEL AS mg_occurrance_num
FROM test_vals m
CONNECT BY LEVEL <= REGEXP_COUNT(m.drug_direction, '((\d+\.)?\d+)MG') /* Count number of matches in each string */
)
SELECT r.drug_direction,
SUM(
TO_NUMBER(
REGEXP_SUBSTR(
r.drug_direction,
'((\d+\.)?\d+)MG',
1,
r.mg_occurrance_num, /* Search for this specific occurrance */
'',
1 /* Get first sub-group (the actual numeric value) */
)
)
) AS total_mg_value
FROM match_rows r
GROUP BY r.drug_direction
ORDER BY r.drug_direction
Note that this assumes all the values are in that exact format (a numeric value followed by the string 'MG').
Upvotes: 2