Reputation: 4388
I am getting a string in the below format after reading data from a csv file
v_lastline = '29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX';
I just want to convert it into an array while will contain 6 values, the comma before the , Inc
needs to be escaped.
Can any one please suggest whats the best way to do it in PL/SQL?
Upvotes: 0
Views: 3693
Reputation: 191235
This is similar to this question, but you have empty elements in your list; and a simple translation of one of the patterns I tried there skips those:
var v_lastline varchar2(50);
exec :v_lastline := '29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX';
select level as lvl,
regexp_substr(:v_lastline, '("[^"]*"|[^,]+)', 1, level) as element
from dual
connect by level <= regexp_count(:v_lastline, '("[^"]*"|[^,]+)');
LVL ELEMENT
---------- ----------------------------------------
1 29218368
2 8062115
3 " Benedict Canyon Equities, Inc"
4 CLS
5 FAX
If you can identify a special character that will never appear in the data then you can work around that by putting that into the empty elements by changing every comma to comma+character, and then removing it after the split:
select level as lvl,
replace(regexp_substr(replace(:v_lastline, ',', ',§'),
'(§"[^"]*"|[^,]+)', 1, level), '§', null) as element
from dual
connect by regexp_substr(replace(:v_lastline, ',', ',§'),
'(§"[^"]*"|[^,]+)', 1, level) is not null;
LVL ELEMENT
---------- ----------------------------------------
1 29218368
2 8062115
3 " Benedict Canyon Equities, Inc"
4 CLS
5
6 FAX
It's an extension of a common method to split delimited strings, which is explained in detail here.
replace(:v_lastline, ',', ',§')
changes ...,CLS,,FAX
to ...,§CLS,§,§FAX
, where §
is a character you'll never see.regexp_substr(..., '(§"[^"]*"|[^,]+)', 1, level)
tokenises the updated value with a regex that looks for any double-quote-enclosed value (now preceded by the special character too) or a non-comma; the order of the evaluation means commas inside the quoted part are ignored.level
is part of the hierarchical query syntax, where:connect by regexp_substr(<same value and pattern>) is not null
just figured out how many tokens there are.replace(regexp_substr(...), , '§', null)
removes the special character used in the first step.You can then remove the double-quotes too with a further level of replace()
, and trim whitespace, if you want/need to.
You have't said quite what you mean by an array, but you can run that query in PL/SQL and bulk-collect into a collection if that's what you intend to work with. For example, using the built-in ODCIVARCHAR2LIST collection type:
set serveroutput on
declare
v_lastline varchar2(50);
v_array sys.odcivarchar2list;
begin
v_lastline := '29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX';
select trim(replace(replace(
regexp_substr(replace(:v_lastline, ',', ',§'),
'(§"[^"]*"|[^,]+)', 1, level), '§', null), '"', null))
bulk collect into v_array
from dual
connect by regexp_substr(replace(:v_lastline, ',', ',§'),
'(§"[^"]*"|[^,]+)', 1, level) is not null;
dbms_output.put_line('Number of elements: ' || v_array.count);
for i in 1..v_array.count loop
dbms_output.put_line('Index ' || i || ' has: ' || v_array(i));
end loop;
end;
/
Number of elements: 6
Index 1 has: 29218368
Index 2 has: 8062115
Index 3 has: Benedict Canyon Equities, Inc
Index 4 has: CLS
Index 5 has:
Index 6 has: FAX
With multiple empty elements this also (now) works:
exec :v_lastline := '29218368,8062115," Benedict Canyon Equities, Inc",,,,,,,CLS,,,,,FAX,,,,,,,,,,,,,,,,,,INVOICE';
select level as lvl,
replace(regexp_substr(replace(:v_lastline, ',', ',§'),
'(§"[^"]*"|[^,]+)', 1, level), '§', null) as element
from dual
connect by regexp_substr(replace(:v_lastline, ',', ',§'),
'(§"[^"]*"|[^,]+)', 1, level) is not null;
LVL ELEMENT
---------- ----------------------------------------
1 29218368
2 8062115
3 " Benedict Canyon Equities, Inc"
4
...
9
10 CLS
11
...
14
15 FAX
16
...
32
33 INVOICE
Upvotes: 3
Reputation: 2043
Here is a solution without regular expressions, first of all create two helper functions
/* CAR select car('hello,world,bla') from dual --> hello */
create or replace function car(PI_STR in varchar2,
PI_SEPARATOR in varchar2 default ',')
return varchar2 is l_pos number;
begin
l_pos := instr(PI_STR, PI_SEPARATOR);
if l_pos > 0 then
return substr(PI_STR, 1, l_pos - 1);
end if;
return PI_STR;
end;
/* CDR select cdr('hello,world,bla') from dual --> world,bla */
create or replace function cdr(PI_STR in varchar2,
PI_SEPARATOR in varchar2 default ',')
return varchar2 is l_pos number;
begin
l_pos := instr(PI_STR, PI_SEPARATOR);
if l_pos > 0 then
return substr(PI_STR, l_pos + length(PI_SEPARATOR));
end if;
return '';
end;
now: extract by ',' and for each result concat with next entry if escape character is found upto next escape character:
create or replace type csv_col is table of varchar2(4000);
create or replace function get_columns(PI_STR in varchar2,
PI_SEPARATOR in varchar2,
PI_ESC_CHAR in varchar2)
return csv_col pipelined is l_car varchar2(4000);
l_cdr varchar2(4000);
l_car_esc varchar2(4000);
begin
l_car := car(PI_STR, PI_SEPARATOR);
l_cdr := cdr(PI_STR, PI_SEPARATOR);
-- check for escape char
l_car_esc := cdr(l_car, PI_ESC_CHAR);
if l_car_esc is not null then
l_car := l_car_esc || PI_SEPARATOR || car(l_cdr, PI_ESC_CHAR);
l_cdr := cdr(cdr(l_cdr, PI_ESC_CHAR), PI_SEPARATOR);
end if;
loop
if l_car is null and l_cdr is null then
exit;
end if;
pipe row(l_car);
l_car := car(l_cdr, PI_SEPARATOR);
l_cdr := cdr(l_cdr, PI_SEPARATOR);
l_car_esc := cdr(l_car, PI_ESC_CHAR);
if l_car_esc is not null then
l_car := l_car_esc || PI_SEPARATOR || car(l_cdr, PI_ESC_CHAR);
l_cdr := cdr(cdr(l_cdr, PI_ESC_CHAR), PI_SEPARATOR);
dbms_output.put_line(l_car);
dbms_output.put_line(l_cdr);
end if;
end loop;
end;
call it like this:
select *
from table(get_columns('29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX',
',',
'"'));
--> result
29218368
8062115
Benedict Canyon Equities, Inc
CLS
FAX
Upvotes: 0
Reputation: 22949
If the structure of your CSV if fixed, you can try with something like this:
with text(text) as ( select '29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX' from dual)
select level,
trim(',' from
case
when level in (1,2) then
regexp_substr(text, '(.*??)\,', 1, level)
when level = 3 then
regexp_substr(text, '"(.*??)"', 1, 1)
when level in (4,5) then
regexp_substr(text, '(.*??)\,', instr(text, '"', 1, 2), level -2)
when level = 6 then
regexp_substr(text, '\,([^\,]*)', instr(text, '"', 1, 2), 3)
end
)
from text
connect by level <= 6
This makes strong assumptions on the structure of CSV, by treating each part in a different way, but it seems to me difficult to find a really generic solution to the problem.
Upvotes: 0