Reputation: 157
Supose we've got the following dataset:
DATE VAR1 VAR2
1 A 1
2 A 1
3 B 1
4 C 2
5 D 3
6 E 4
7 F 5
8 B 6
9 B 7
10 D 1
Each record belongs to a person, the problem is that a single person can have more than one record with different values.
To identify a person: If you share the same VAR1, you are the same person, BUT also if you share the same VAR2, you are the same person.
My objective is to create a new variable IDPERSON which uniquely identifies the person for each record. In my example, there are only 4 different people:
DATE VAR1 VAR2 IDPERSON
1 A 1 1
2 A 1 1
3 B 1 1
4 C 2 2
5 D 3 1
6 E 4 3
7 F 5 4
8 B 6 1
9 B 7 1
10 D 1 1
How could I achieve this by using SQL or SAS?
Upvotes: 2
Views: 340
Reputation: 157
I forgot to post my final solution, it is a SAS macro. I've made another one for 3 variables.
%MACRO GROUPER2(INDATA,OUTDATA,ID1,ID2,IDOUT,IDN=_N_,MAXN=5);
%PUT ****************************************************************;
%PUT ****************************************************************;
%PUT **** GROUPER MACRO;
%PUT **** PARAMETERS:;
%PUT **** INPUT DATA: &INDATA.;
%PUT **** OUTPUT DATA: &OUTDATA.;
%PUT **** FIRST VARIABLE: &ID1.;
%PUT **** SECOND VARIABLE: &ID2.;
%PUT **** OUTPUT GROUPING VARIABLE: &IDOUT.;
%IF (&IDN.=_N_) %THEN %PUT **** STARTING NUMBER VARIABLE: AUTONUMBER;
%ELSE %PUT **** STARTING NUMBER VARIABLE: &IDN.;
%PUT **** MAX ITERATIONS: &MAXN.;
%PUT ****************************************************************;
%PUT ****************************************************************;
/* CREATE FIRST GUESS FOR GROUP ID */
DATA _G_TEMP1 _G_TEMP2;
SET &INDATA.;
&IDOUT.=&IDN.;
IF &IDOUT.=. THEN OUTPUT _G_TEMP2;
ELSE OUTPUT _G_TEMP1;
RUN;
PROC SQL NOPRINT;
SELECT MAX(&IDOUT.) INTO :MAXIDOUT FROM _G_TEMP1;
QUIT;
DATA _G_TEMP2;
SET _G_TEMP2;
&IDOUT.=_N_+&MAXIDOUT.;
RUN;
DATA _G_TEMP;
SET _G_TEMP1 _G_TEMP2;
RUN;
PROC SQL;
UPDATE _G_TEMP SET &IDOUT.=. WHERE &ID1. IS NULL AND &ID2. IS NULL;
QUIT;
/* LOOP, IMPROVE GROUP ID EACH TIME*/
%LET I = 1;
%DO %WHILE (&I. <= &MAXN.);
%PUT LOOP NUMBER &I.;
%LET I = %EVAL(&I. + 1);
PROC SQL NOPRINT;
/* FIND THE LOWEST GROUP ID FOR EACH GROUP OF FIRST VARIABLE */
CREATE TABLE _G_MAP1 AS SELECT MIN(&IDOUT.) AS &IDOUT., &ID1. FROM _G_TEMP WHERE &ID1. IS NOT NULL GROUP BY &ID1.;
/* FIND THE LOWEST GROUP ID FOR EACH GROUP OF SECOND VARIABLE */
CREATE TABLE _G_MAP2 AS SELECT MIN(&IDOUT.) AS &IDOUT., &ID2. FROM _G_TEMP WHERE &ID2. IS NOT NULL GROUP BY &ID2.;
/* FIND THE LOWEST GROUP ID FROM BOTH GROUPING VARIABLES */
CREATE TABLE _G_NEW AS SELECT A.&ID1., A.&ID2., COALESCE(MIN(B.&IDOUT., C.&IDOUT.), A.&IDOUT.) AS &IDOUT.,
A.&IDOUT. AS &IDOUT._OLD FROM _G_TEMP AS A FULL OUTER JOIN _G_MAP1 AS B ON A.&ID1. = B.&ID1.
FULL OUTER JOIN _G_MAP2 AS C ON A.&ID2. = C.&ID2.;
/* PUT RESULTS INTO TEMPORARY DATASET READY FOR NEXT ITTERATION */
CREATE TABLE _G_TEMP AS SELECT * FROM _G_NEW ORDER BY &ID1., &ID2.;
/* CHECK IF THE ITTERATION PROVIDED ANY IMPROVEMENT */
SELECT MIN(CASE WHEN &IDOUT._OLD = &IDOUT. THEN 1 ELSE 0 END) INTO :STOPFLAG FROM _G_TEMP;
%PUT NO IMPROVEMENT? &STOPFLAG.;
QUIT;
/* END LOOP IF ID UNCHANGED OVER LAST ITTERATION */
%LET ITERATIONS=%EVAL(&I. - 1);
%IF &STOPFLAG. %THEN %LET I = %EVAL(&MAXN. + 1);
%END;
%PUT ****************************************************************;
%PUT ****************************************************************;
%IF &STOPFLAG. %THEN %PUT **** LOOPING ENDED BY NO-IMPROVEMENT CRITERIA. OUTPUT FULLY GROUPED.;
%ELSE %PUT **** WARNING: LOOPING ENDED BY REACHING THE MAXIMUM NUMBER OF ITERARIONS. OUTPUT NOT FULLY GROUPED.;
%PUT **** NUMBER OF ITERATIONS: &ITERATIONS. (MAX: &MAXN.);
%PUT ****************************************************************;
%PUT ****************************************************************;
DATA &OUTDATA.;
SET _G_TEMP;
DROP &IDOUT._OLD;
RUN;
/* OUTPUT LOOKUP TABLE */
PROC SQL;
CREATE TABLE &OUTDATA._1 AS SELECT &ID1., MIN(&IDOUT.) AS &IDOUT. FROM _G_TEMP WHERE &ID1. IS NOT NULL GROUP BY &ID1. ORDER BY &ID1.;
CREATE TABLE &OUTDATA._2 AS SELECT &ID2., MIN(&IDOUT.) AS &IDOUT. FROM _G_TEMP WHERE &ID2. IS NOT NULL GROUP BY &ID2. ORDER BY &ID2.;
QUIT;
/* CLEAN UP */
PROC DATASETS NOLIST;
DELETE _G_:;
QUIT;
%MEND GROUPER2;
Upvotes: 0
Reputation: 26
%macro grouper(
inData /*Input dataset*/,
outData /*output dataset*/,
id1 /*First identification variable (must be numeric)*/,
id2 /*Second identification variable*/,
idOut /*Name of variable to contain group ID*/,
maxN = 5 /*Max number of itterations in case of failure*/);
/* Assign an ID to each distict connected graph in a a network */
/* Create first guess for group ID */
data _g_temp;
set &inData.;
&idOut. = &id1.;
run;
/* Loop, improve group ID each time*/
%let i = 1;
%do %while (&i. <= &maxN.);
%put Loop number &i.;
%let i = %eval(&i. + 1);
proc sql noprint;
/* Find the lowest group ID for each group of first variable */
create table _g_map1 as
select
min(&idOut.) as &idOut.,
&id1.
from _g_temp
group by &id1.;
/* Find the lowest group ID for each group of second variable */
create table _g_map2 as
select
min(&idOut.) as &idOut.,
&id2.
from _g_temp
group by &id2.;
/* Find the lowest group ID from both grouping variables */
create table _g_new as
select
a.&id1.,
a.&id2.,
coalesce(min(b.&idOut., c.&idOut.), a.&idOut.) as &idOut.,
a.&idOut. as &idOut._old
from _g_temp as a
full outer join _g_map1 as b
on a.&id1. = b.&id1.
full outer join _g_map2 as c
on a.&id2. = c.&id2.;
/* Put results into temporary dataset ready for next itteration */
create table _g_temp as
select *
from _g_new;
/* Check if the itteration provided any improvement */
select
min(
case when &idOut._old = &idOut. then 1
else 0
end) into :stopFlag
from _g_temp;
quit;
/* End loop if ID unchanged over last itteration */
%if &stopFlag. %then %let i = %eval(&maxN. + 1);
%end;
/* Output lookup table */
proc sql;
create table &outData. as
select
&id1.,
min(&idOut.) as &idOut.
from _g_temp
group by &id1.;
quit;
/* Clean up */
proc datasets nolist;
delete _g_:;
quit;
%mend grouper;
DATA baseData;
INPUT VAR1 VAR2 $;
CARDS;
1 A
1 A
1 B
2 C
3 D
4 E
5 F
6 B
7 B
1 D
1 X
7 G
6 Y
6 D
6 I
8 D
9 Z
9 X
;
RUN;
%grouper(
baseData,
outData,
VAR1,
VAR2,
groupID);
Upvotes: 1
Reputation: 157
Keith:
You solution does not work properly, take a look at the following dataset:
DATA TEMP3;
INPUT VAR2 VAR1 $ DATE;
DUMMY=1;
CARDS;
1 A 1
1 A 2
1 B 3
2 C 4
3 D 5
4 E 6
5 F 7
6 B 8
7 B 9
1 D 10
1 X 11
7 G 14
6 Y 15
6 D 16
6 I 18
8 D 20
9 Z 21
9 X 22
;
RUN;
Your program's result is:
VAR2 VAR1 DATE DUMMY idperson
1 A 1 1 1
1 A 2 1 1
1 B 3 1 1
2 C 4 1 2
3 D 5 1 1
4 E 6 1 3
5 F 7 1 4
6 B 8 1 1
7 B 9 1 1
1 D 10 1 1
1 X 11 1 1
7 G 14 1 6
6 Y 15 1 5
6 D 16 1 1
6 I 18 1 5
8 D 20 1 1
9 Z 21 1 7
9 X 22 1 1
Which are not corrent since Var1=6 records have two different ids.
This is what i've done, the whole program (not posted here) is more complex (and not so elegant) since it deals with missing data in Var1 and Var2.
PROC SQL;
CREATE TABLE WORK.TEMP4 AS SELECT DISTINCT VAR1, VAR2 FROM WORK.TEMP3 WHERE DUMMY=1 AND VAR2^=. ORDER BY VAR1, VAR2;
CREATE TABLE WORK.TEMP5 AS SELECT DISTINCT VAR2, VAR1 FROM WORK.TEMP3 WHERE DUMMY=1 AND VAR2^=. ORDER BY VAR2, VAR1;
CREATE TABLE WORK.TEMP6 AS SELECT TEMP4.*, TEMP5.VAR1 AS CIP2 FROM WORK.TEMP4 INNER JOIN WORK.TEMP5 ON (TEMP4.VAR2=TEMP5.VAR2);
CREATE TABLE WORK.TEMP7 AS SELECT TEMP6.*, TEMP4.VAR2 AS IDHH2 FROM WORK.TEMP6 INNER JOIN WORK.TEMP4 ON (TEMP6.VAR1=TEMP4.VAR1);
CREATE TABLE WORK.TEMP8 AS SELECT DISTINCT IDHH2, CIP2 FROM WORK.TEMP7;
CREATE TABLE WORK.TEMP9 AS SELECT TEMP7.*, TEMP8.CIP2 AS CIP3 FROM WORK.TEMP7 INNER JOIN WORK.TEMP8 ON (TEMP7.IDHH2=TEMP8.IDHH2);
CREATE TABLE WORK.TEMP10 AS SELECT TEMP9.*, TEMP8.IDHH2 AS IDHH3 FROM WORK.TEMP9 INNER JOIN WORK.TEMP8 ON (TEMP9.CIP3=TEMP8.CIP2);
CREATE TABLE WORK.TEMP11 AS SELECT DISTINCT VAR1, IDHH3 AS VAR2 FROM WORK.TEMP10 ORDER BY VAR1, IDHH3;
CREATE TABLE WORK.TEMP12 AS SELECT VAR1, MAX(VAR2) AS VAR2 FROM WORK.TEMP11 GROUP BY VAR1;
CREATE TABLE WORK.TEMP13 AS SELECT TEMP11.* FROM WORK.TEMP11 INNER JOIN WORK.TEMP12 ON (TEMP11.VAR1=TEMP12.VAR1 AND TEMP11.VAR2=TEMP12.VAR2);
CREATE TABLE WORK.TEMP14 AS SELECT TEMP3.*, TEMP13.VAR2 AS IDPERSONA FROM WORK.TEMP3 LEFT JOIN WORK.TEMP13 ON (TEMP3.VAR1=TEMP13.VAR1);
CREATE TABLE WORK.TEMP15 AS SELECT DISTINCT VAR2, IDPERSONA FROM WORK.TEMP14 WHERE VAR2^=. AND IDPERSONA^=.;
CREATE TABLE WORK.TEMP16 AS SELECT TEMP14.*, TEMP15.IDPERSONA AS IDPERSONA2 FROM WORK.TEMP14 LEFT JOIN WORK.TEMP15 ON (TEMP14.VAR2=TEMP15.VAR2) ORDER BY DATE;
QUIT;
DATA TEMP16;
SET TEMP16;
IF IDPERSONA=. THEN IDPERSONA=IDPERSONA2;
DROP IDPERSONA2;
RUN;
And the right results:
VAR2 VAR1 DATE DUMMY IDPERSONA
1 A 1 1 9
1 A 2 1 9
1 B 3 1 9
2 C 4 1 2
3 D 5 1 9
4 E 6 1 4
5 F 7 1 5
6 B 8 1 9
7 B 9 1 9
1 D 10 1 9
1 X 11 1 9
7 G 14 1 9
6 Y 15 1 9
6 D 16 1 9
6 I 18 1 9
8 D 20 1 9
9 Z 21 1 9
9 X 22 1 9
Upvotes: 0
Reputation: 7602
I've broken down this problem into a few steps, which works for the data you've supplied. There's probably a way to reduce the number of steps, at the expense of readability. Let me know if this works for your real data.
/* create input dataset */
data have;
input DATE VAR1 $ VAR2;
datalines;
1 A 1
2 A 1
3 B 1
4 C 2
5 D 3
6 E 4
7 F 5
8 B 6
9 B 7
10 D 1
;
run;
/* calculate min VAR2 per VAR1 */
proc summary data=have nway idmin;
class var1;
output out=minvar2 (drop=_:) min(var2)=temp_var;
run;
/* add in min VAR2 data */
proc sql;
create table temp1 as select
a.*,
b.temp_var
from have as a
inner join
minvar2 as b
on a.var1 = b.var1
order by b.temp_var;
quit;
/* create idperson variable */
data want;
set temp1;
by temp_var;
if first.temp_var then idperson+1;
drop temp_var;
run;
/* sort back to original order */
proc sort data=want;
by date var1;
run;
Upvotes: 0
Reputation: 157
Do you think this will work?
It's written in SAS, but it uses SQL sentences.
DATA TEMP3;
INPUT VAR1 VAR2 $ DATE;
CARDS;
1 A 1
1 A 2
1 B 3
2 C 4
3 D 5
4 E 6
5 F 7
6 B 8
7 B 9
1 D 10
;
RUN;
PROC SQL;
CREATE TABLE WORK.TEMP4 AS SELECT DISTINCT VAR2, VAR1 FROM WORK.TEMP3 ORDER BY VAR2, VAR1;
CREATE TABLE WORK.TEMP5 AS SELECT DISTINCT VAR1, VAR2 FROM WORK.TEMP3 ORDER BY VAR1, VAR2;
CREATE TABLE WORK.TEMP6 AS SELECT TEMP4.VAR2, TEMP4.VAR1, TEMP5.VAR2 AS VAR22 FROM WORK.TEMP4 INNER JOIN WORK.TEMP5 ON (TEMP4.VAR1=TEMP5.VAR1);
CREATE TABLE WORK.TEMP7 AS SELECT TEMP6.*, TEMP5.VAR1 AS VAR12 FROM WORK.TEMP6 INNER JOIN WORK.TEMP5 ON (TEMP6.VAR2=TEMP5.VAR2);
CREATE TABLE WORK.TEMP8 AS SELECT DISTINCT VAR22, VAR12 FROM WORK.TEMP7 ORDER BY VAR22, VAR12;
CREATE TABLE WORK.TEMP9 AS SELECT VAR22, MAX(VAR12) AS VAR12 FROM WORK.TEMP8 GROUP BY VAR22;
CREATE TABLE WORK.TEMP10 AS SELECT TEMP8.* FROM WORK.TEMP8 INNER JOIN WORK.TEMP9 ON (TEMP8.VAR22=TEMP9.VAR22 AND TEMP8.VAR12=TEMP9.VAR12);
CREATE TABLE WORK.TEMP11 AS SELECT TEMP3.*, TEMP10.VAR12 AS IDPERSONA FROM WORK.TEMP3 LEFT JOIN WORK.TEMP10 ON (TEMP3.VAR2=TEMP10.VAR22);
QUIT;
Upvotes: 0