Reputation: 3
I need some help with writing a SAS code. I have a dataset that contains id numbers for people (variable name directorid), ids for company names where they worked (variable name companyid), their start date (variable name datestartrole), and the end date (variable name dateendrole) at each place they worked. Note that one person can have more than one jobs at a time. Also note that the same person might work for a given firm, leave that firm, and then come back and work for the same firm.
here are first few rows of the input data:
Obs directorid companyid datestartrole dateendrole
1 16 15486 1983-01-01 1993-10-01
2 16 15486 1993-10-01 1999-02-18
3 16 15486 1999-02-18 1999-07-22
4 16 15486 1999-07-22 2002-01-24
5 16 21616 2004-12-08 2007-02-23
6 16 1623243 2010-02-03 2014-06-03
7 16 2992276 2019-02-07 2019-09-04
8 16 2992276 2019-09-04 2020-05-13
9 18 26875 1989-02-01 2004-03-27
10 27 5251 1991-01-01 1995-12-31
I am trying to build a dataset that has directorid, year, and month (going from 1990 to march 2023), number of different companies each person have worked for as of a given month and the amount of experience(in employment months) they have had as of a given month.
The final dataset should look like this:
Obs directorid month year number_of_companies months_of_experience
Thanks in advance!
Upvotes: 0
Views: 39
Reputation: 146
I believe this will work for your purposes.
** Main source - one record per directorid per company per month worked **;
** Use m defined as 0=Jan1990, 1=Feb1990, ..., 407=Dec2023 for easier coding **;
data monthly (keep=directorid companyid m);
set have;
startm = 12 * (year(datestartrole) - 1990) + month(datestartrole) - 1;
endm = 12 * (year(dateendrole) - 1990) + month(dateendrole) - 1;
do m = startm to endm;
output;
end;
run;
** Identify first month worked for each company **;
proc sort data=monthly;
by directorid companyid m;
proc sort data=monthly out=firstwork (rename=(m=firstm)) nodupkey;
by directorid companyid;
proc sort data=firstwork;
by directorid firstm;
data shell_numcompany (keep=directorid m number_of_companies);
set firstwork;
by directorid firstm;
retain prevm number_of_companies;
if first.directorid then do;
number_of_companies = 0;
prevm = -1;
end;
if first.firstm then do;
** Output records with current number_of_companies before updating **;
** Do not need to do this if months are contiguous **;
if (firstm - prevm) > 1 then do;
do m = prevm + 1 to firstm - 1; output; end;
end;
end;
** Increment counter (new company started this month) **;
number_of_companies + 1;
** If last company started at this month, output the new record **;
if last.firstm then do;
m = firstm;
output;
** Set prevm to current month **;
prevm = m;
end;
** If last record for person, output all remaining months with final total **;
if last.directorid then do;
if prevm < 407 then do m = prevm + 1 to 407; output; end;
end;
run;
** Get cumulative number of months worked at each month **;
proc sort data=monthly; by directorid m;
data shell_nummonths (keep=directorid m months_worked);
set monthly (rename=(m=tempm));
by directorid tempm;
** Only need max one record per month worked **:
if first.tempm;
retain months_worked prevm;
if first.directorid then do;
months_worked = 0;
prevm = -1;
end;
** Output months before this record at the previous value **;
if (tempm - prevm) > 1 then do m = prevm + 1 to tempm - 1; output; end;
months_worked + 1;
if last.tempm then do;
m = tempm;
output;
prevm = m;
end;
** If last record before end, fill the rest with the current value **;
if last.directorid then do;
if prevm < 407 then do m = prevm + 1 to 407; output; end;
end;
run;
** Merge back together and reconstruct month/year **;
data want (drop=m);
merge shell_numcompany (in=in1)
shell_nummonths (in=in2)
;
by directorid m;
if ^(in1 & in2) then abort;
month = mod(m,12) + 1;
year = floor(m/12);
run;
Upvotes: 0
Reputation: 3
Partial Answer that gets me the number of companies:
data unique_directorid;
set have;
by directorid;
if first.directorid;
run;
data all_months;
format yearmon date9.;
do year = 1990 to 2023;
do month = 1 to 12;
yearmon = mdy(month, 1, year);
if yearmon <= '31MAR2023'd then output;
end;
end;
run;
proc sql;
create table director_dates as
select a.directorid, b.year, b.month
from unique_directorid a
cross join all_months b;
quit;
data director_dates; format dateval date9.; set director_dates; dateval=intnx("month",mdy(month,1,year),0,"end"); run;
proc sort data=director_dates; by directorid dateval; run;
/*get count of number of firms*/
proc sql;
create table exp1 as
select a.*,count(distinct b.companyid) as numfirms
from director_dates as a left outer join emptemp as b
on a.directorid=b.directorid and a.dateval>=b.datestartrole
group by a.directorid,dateval;
quit;
proc sort data=exp1 nodup; by directorid dateval; run;
Upvotes: 0
Reputation: 1394
data have;
input directorid$ companyid$ datestartrole dateendrole;
informat datestartrole dateendrole yymmdd10.;
format directorid companyid $8. datestartrole dateendrole yymmdd10.;
cards;
16 15486 1983-01-01 1993-10-01
16 15486 1993-10-01 1999-02-18
16 15486 1999-02-18 1999-07-22
16 15486 1999-07-22 2002-01-24
16 21616 2004-12-08 2007-02-23
16 1623243 2010-02-03 2014-06-03
16 2992276 2019-02-07 2019-09-04
16 2992276 2019-09-04 2020-05-13
18 26875 1989-02-01 2004-03-27
27 5251 1991-01-01 1995-12-31
;
run;
*1. Expand the raw data: Each year, month, company has one record;
data temp1;
set have;
do date=datestartrole to dateendrole;
month=month(date);
year=year(date);
output;
end;
run;
proc sort data=temp1 out=temp2 nodupkey;
by directorid year month companyid;
run;
*2. Compute number of companies monthly, compute months of experience for each director;
data want(where=(1990<=year<=2022 or (year=2023 and month<=3)));
set temp2;
by directorid year month;
if first.month then call missing(number_of_companies);
if first.directorid then call missing(months_of_experience);
number_of_companies+1;
months_of_experience+1;
if last.month;
keep directorid month year number_of_companies months_of_experience;
run;
Upvotes: 0