*! 26jun2011
* codescan: ado entry point. Preserves the data in memory and calls the Mata
* driver DoIt(), which reads the command line from local macro 0.
program codescan
    version 11.1
    preserve
    mata: DoIt()
end

version 11.1
mata:

// cscanner: one "scanner" = a named set of code rows that is matched against
// archive files and accumulated into per-observation flag/count/first/last
// results. Static members are shared by every scanner instance.
class cscanner { //>>def class<<
    // name: trimmed scanner name; update() uses it as the prefix of the
    //       generated variables (name_flag, name_count, name_first, name_last)
    string scalar name
    // offset1/offset2: added to the date view in initf() to form start/stop
    real scalar offset1, offset2
    // returns: indexed 1..4 in update(); nonzero requests flag/count/first/last
    real vector returns
    // varpat/codepat: one pointer per code row. coderow() stores pointers to
    // its var/code arguments; initf() replaces them with pointers to the
    // selected variable names and to the result of regex_of_icd9/regex_of_num
    pointer vector varpat, codepat
    // codepar: second element of the row's code type; match() uses it as the
    // zero-padded width when converting numeric variables to strings
    string vector codepar
    // eq: per code row; nonzero = row must hit, zero = row must not hit
    // start/stop/count/first/last: per-observation vectors filled by initf()
    // and updated by match()
    real vector eq, start, stop, count, first, last
    // ptids: id values of the data in memory when globinit() ran
    static vector ptids
    // vardefs/filedefs: string contents of "codescan vars.dta" and
    // "codescan files.dta"
    // archfiles: one row per archive file; col 1 comes from multipath(),
    // cols 2..5 are copied from filedefs, col 6 accumulates variable names
    static string matrix vardefs, filedefs, archfiles
    // dr: per archive file (min of start, max of stop) over the scanners using it
    // fscgrid: archive files x scanners grid; one column appended per initf()
    static real matrix dr, fscgrid
    void globinit()
    string scalar init1()
    void coderow()
    void initf()
    string matrix fin()
    void match()
    void update()
}

// globinit: fill the static (shared) members.
//   idvar  - name of the id variable in the data in memory (numeric or string)
//   af     - ";"-separated archive file specification, expanded by multipath()
//   config - optional directory holding the two configuration datasets;
//            passed through pcanon(config,"dir") when non-empty
void cscanner::globinit(string scalar idvar, string scalar af, string scalar config) { //>>def member<<
    class dataset scalar ds
    if (st_isnumvar(idvar)) ptids=st_data(.,idvar)
    else ptids=st_sdata(.,idvar)
    if (config!="") config=pcanon(config,"dir")
    ds.read("data",config+"codescan files.dta")
    filedefs=ds.strdat(.,.)
    ds.read("data",config+"codescan vars.dta")
    vardefs=ds.strdat(.,.)
    // rewrite column 4 of vardefs, replacing each filedefs[,1] text by filedefs[,2]
    for (r=1;r<=rows(filedefs);r++) vardefs[,4]=subinstr(vardefs[,4],filedefs[r,1],filedefs[r,2])
    parts=columnize(af,";")
    archfiles=J(0,1,"")
    for (p=1;p<=length(parts);p++) archfiles=archfiles\multipath(parts[p],".dta")
    archfiles=uniqrows(archfiles)
    //if (length(archfiles)==0) errel(sprintf("No files found: %s\n",concat(parts,";")))
    archfiles=archfiles,J(rows(archfiles),5,"")
    for (f=1;f<=rows(archfiles);f++) {
        // only files that map to exactly one filedefs row get columns 2..5 filled
        if (length(fd=toindices(vvmap(filedefs[,2],varlist("",archfiles[f,1])',1)))==1) {
            archfiles[f,2..5]=filedefs[fd,2..5]
        }
    }
    // seed the ranges so rowmin()/rowmax() in initf() can only tighten them
    dr=J(rows(archfiles),1,(.,c("minlong")))
    fscgrid=J(rows(archfiles),0,.)
}

// init1: set the per-scanner settings and allocate coderows empty code rows.
// Returns the trimmed scanner name.
string scalar cscanner::init1(string scalar n, real scalar off1, real scalar off2, real vector rets, real scalar coderows) { //>>def member<<
    name=strtrim(n)
    offset1=off1
    offset2=off2
    returns=rets
    varpat=codepat=J(coderows,1,NULL)
    eq=J(coderows,1,.)
    codepar=J(coderows,1,"")
    return(name)
}

// coderow: register code row r. Stores pointers to the var and code arguments
// (not copies) together with the comparison flag comp.
void cscanner::coderow(real scalar r, string scalar var, real scalar comp, string scalar code) { //>>def member<<
    varpat[r]=&var
    eq[r]=comp
    codepat[r]=&code
}

// initf: resolve every code row against vardefs, set up the per-observation
// result vectors, and record which archive files this scanner needs.
//   dateview - per-observation dates; offset1/offset2 are added to it
// Calls errel() when output variables already exist, when a row's variables
// mix code types, or when the code type is neither "icd9" nor "num".
void cscanner::initf(transmorphic vector dateview) { //>>def member<<
    printf("Setting up %s\n",name)
    if (length(exist=varlist(concat(name:+("_flag","_count","_first","_last")," ")))) errel(concat(exist," ")+" already exist(s)")
    start=dateview:+offset1 //dfrom
    stop=dateview:+offset2 //dto
    count=J(st_nobs(),1,0) //count
    first=J(st_nobs(),1,.) //dfirst
    last=J(st_nobs(),1,c("minlong")) //dlast
    pickers=""
    allvars=J(0,1,"")
    for (r=1;r<=rows(varpat);r++) {
        varmatch=listmatch(vardefs[,1],*varpat[r]) //potential issues resolving (nonsensical) ranges
        codetype=uniqrows(select(vardefs[,2..3],varmatch))
        if (rows(codetype)>1) errel("A varlist for "+name+" refers to more than one type of code.")
        varpat[r]=&select(vardefs[,1],varmatch) //colvector ok?
        // codepat[r] is replaced by a pointer to the helper's result
        // (the mis-encoded "&reg" prefix of both helper names is restored here)
        if (codetype[1]=="icd9") codepat[r]=&regex_of_icd9(*codepat[r],codetype[2])
        else if (codetype[1]=="num") codepat[r]=&regex_of_num(*codepat[r],codetype[2])
        else errel("Unknown code type (not icd9 or num)")
        codepar[r]=codetype[2] //only width is used...
        pickers=pickers+" "+concat(select(vardefs[,4],varmatch)," ")
        allvars=allvars\*varpat[r]
    }
    // new fscgrid column for this scanner; fix = rows (files) flagged in it
    fscgrid=fscgrid,listmatch(archfiles[,2],pickers)
    fix=toindices(fscgrid[,cols(fscgrid)])
    // NOTE(review): this appends without a separator, so a second scanner on
    // the same file may fuse its first name onto the previous last name -
    // confirm against concat()/columnize() behavior
    archfiles[fix,6]=archfiles[fix,6]:+concat(uniqrows(allvars)," ")
    dr[fix,1]=rowmin((dr[fix,1],J(rows(fix),1,min(start))))
    dr[fix,2]=rowmax((dr[fix,2],J(rows(fix),1,max(stop))))
}

// fin: drop archive files no scanner uses and return one row per remaining
// file: (file from archfiles[,1], de-duplicated names from columns 3..6,
// "if" expression built from columns 5 and 4 and the range in dr).
string matrix cscanner::fin() { //>>def member<<
    archfiles=select(archfiles,rowsum(fscgrid))
    x=J(rows(archfiles),3,"")
    x[,1]=archfiles[,1]
    for (r=1;r<=rows(x);r++) x[r,2]=concat(uniqrows(columnize(archfiles[r,3..6]," ")')," ") //this may be superfluous now
    // NOTE(review): dr (and fscgrid) are not filtered like archfiles above, so
    // their row counts differ whenever a file was dropped - confirm
    x[,3]="if ":+archfiles[,5]:+">=":+strofreal(dr[,1]):+"&":+archfiles[,4]:+"<=":+strofreal(dr[,2])
    return(x)
}

// match: scan the data currently in memory and update count/first/last.
//   filerow - row of archfiles naming the id variable(s) (col 3) and the two
//             variables (cols 4..5) compared against stop and start
// An observation is a hit when every code row is satisfied: rows with nonzero
// eq must match their regex in at least one variable, rows with zero eq in none.
void cscanner::match(filerow) { //>>def member<<
    V=.
    hits=J(st_nobs(),1,1)
    for (r=1;r<=rows(varpat);r++) {
        hit=J(st_nobs(),1,0)
        for (s=1;s<=length(*varpat[r]);s++) { //each of these should be one var, now
            if (st_isstrvar((*varpat[r])[s])) st_sview(V,.,(*varpat[r])[s])
            else {
                st_view(V,.,(*varpat[r])[s])
                // sprintf() returns the format string that strofreal() needs
                // (printf() returns nothing), and codepar is a string, hence %s
                V=strofreal(V,sprintf("%%0%s.0f",codepar[r])) //only use of codepar...
            }
            hit=hit:|rowmax(regexm(V,*codepat[r]))
        }
        hits=eq[r]?(hits:&hit):(hits:&!hit)
    }
    varview(ids=.,toindices(hits),archfiles[filerow,3])
    st_view(V=.,toindices(hits),archfiles[filerow,4..5])
    for (r=1;r<=rows(V);r++) {
        // observations with the same id whose [start,stop] window overlaps
        // the record's (V[r,1], V[r,2]) pair
        match=toindices(ids[r]:==ptids:&V[r,2]:>=start:&V[r,1]:<=stop)
        count[match]=count[match]:+1
        if (returns[3]) first[match]=rowmin((first[match],J(rows(match),1,V[r,1])))
        if (returns[4]) last[match]=rowmax((last[match],J(rows(match),1,V[r,2])))
    }
}

// update: create the requested result variables in the data in memory and
// store the accumulated values; first/last get a %td display format.
void cscanner::update() { //>>def member<<
    dbits=(&(count:>0),&count,&first,&last)
    // the c("minlong") seed means "never matched": turn it into missing
    if (returns[4]) _editvalue(last,c("minlong"),.)
    for (c=1;c<=4;c++) {
        if (returns[c]) {
            vtype=("byte","int","long","long")[c] //pick these better
            vname=name+("_flag","_count","_first","_last")[c]
            (void) st_addvar(vtype,vname)
            if (c>2) stata(sprintf("format %%td %s",vname))
            st_store(.,vname,*dbits[c])
        }
    }
}

// DoIt: Mata driver called by the codescan program. Parses local macro 0 via
// syntaxl(), then builds the scanners either from a using dataset or (per the
// error text below) from parameters given directly.
// NOTE(review): this function is incomplete in the text under review; see the
// marker further down.
void DoIt() { //>>def function<<
    class dataset scalar ds
    class sequser scalar su
    class cscanner vector scnrs
    syntaxl(st_local("0"),(&(spec="[anything]"),&(use="[using]")),(&(datevar="date:var()"),&(idvar="[id:var()]"),&(archfiles="files()"), &(dayrange="[day:range()]"),&(glfl="[flag]"),&(glc="[count]"),&(glf="[first]"),&(gll="[last]"),&(config="[conf:igpath()]")))
    if (spec==""&use=="") errel("No scan parameters provided")
    if (strlen(spec) & strlen(use)) errel("Scan parameters can be provided directly, or through a {it:using} file, but not both")
    if (idvar=="") idvar="scrssn"
    globdays=strtoreal(columnize(dayrange,"/"))
    st_view(dateview=.,.,datevar)
    if (strlen(use)) { //create from dataset (no global settings)
        ds.read("data",use)
        scnrs=cscanner(ds.nobs)'
        scsort=J(rows(scnrs),1,"")
        // static members are shared, so one globinit() call serves all scanners
        scnrs[1].globinit(idvar,archfiles,config)
        for (r=1;r<=length(scnrs);r++) {
            coderows=subvec(toindices(strlen(ds.inout(r,ds.varlist("codes*")))),(-1,-1))
            scsort[r]=scnrs[r].init1(ds.inout(r,1),ds.inout(r,2),ds.inout(r,3),ds.inout(r,4..7),coderows)
            // NOTE(review): from here on the text is garbled ("c0)") and later
            // truncated; it is reproduced verbatim and must be restored from
            // the original file
            for (c=0;c0) { for (sc=1;sc<=rows(scnrs);++sc) if (scnrs[1].fscgrid[used[1],sc]) scnrs[sc].match(used[1]) } } while (go) stata("restore") for (r=1;r