Reputation: 97
This is something that I can do easily in Excel. But Im being confounded by R.
I would like to assign country names to a long list of strings ("affiliation").
c("Department of Psychiatry and Behavioural Sciences, University College London Medical School, UK.",
"", "Ty Dewi Sant School of Nursing, University Hospital of Wales, College of Medicine, Cardiff.",
"University of Massachusetts Medical Center.", "Older Women's League.",
"Kimberly Quality Care, Boston, MA.", "Michaux Manor Living Center, Fayetteville, PA.",
"Florida Diagnostic and Learning Resources System, University of South Florida, Tampa 33613.",
"", "Bigel Institute for Health Policy, Brandeis University, Waltham, MA.",
"", "York Health Authority.", "Southern Illinois University, Edwardsville.",
"St. Joseph's Hospital, Memphis, TN.", "Long Term Home Care of the Frail Elderly Foundation, New York City.",
"Catholic University of America, Washington, DC.", "Mercy Health Center, Oklahoma City, OK.",
"", "Visiting Nurse Service of New York.", "RespiteCare Center, Evanston, IL.",
"Camden and Islington HA.", "National Advisory Council on Aging.",
"Visiting Nurse Service of New York.", "American Health Care Association, Washington, DC.",
"HealthCare Partners Medical Group, Los Angeles, CA 90015, USA.",
"Tad Publishing Company, Peoria, IL, USA.", "Child Health Investment Partnership, Roanoke, VA, USA.",
"School of Public Health, State University of New York, Albany 12237, USA.",
"Bundoora Extended Care Centre.", "", "", "Family Respite Center, Falls Church, VA, USA.",
"", "University of Victoria.", "", "Homemaker Health Aide Service of the National Capital Area.",
"West Lambeth Health Authority, London SE1 7EH.", "Bon Secours Hospital/Villa Maria Nursing Center, North Miami, FL 33161.",
"Alzheimer's Disease and Related Disorders Association, Syracuse, NY.",
"Alzheimer's Association, Washington DC.", "South Carolina Commission on Aging, Columbia.",
"University of New Mexico College of Nursing.", "Department of Human Development and Family Studies, University of Alabama, Tuscaloosa.",
"Ballard Health Care Residence, Des Plaines, IL.", "Bowman Gray School of Medicine of Wake Forest University, Winston-Salem, NC.",
"Case Western Reserve University.", "School of Public and Environmental Administration, Indiana University, Indianapolis 46202.",
"Manor HealthCare Corp, Silver Spring, MD.", "Relationship Builders, Napa, CA.",
"", "", "Medical University of South Carolina, USA.", "Tokyo Metropolitan Institute of Gerontology, Itabashi, Japan. [email protected]",
"Medical University of South Carolina, USA.", "Royal Hospital for Sick Children, Bristol.",
"Barefield, Ennis, Co. Clare., Ireland.", "North Georgia College, Dahlonega 30597, USA.",
"Institute for Psychology (I), University of Wurzburg, Germany.",
"Camborne Redruth Community Hospital, Cornwall, United Kingdom.",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "Institute of Child Health and Great Ormond Street Hospital for Children NHS Trust, London, UK.",
"Department of Psychiatry, University of Toronto, Toronto, Ontario, Canada. [email protected]",
"Boston University School of Social Work, MA 02215, USA.", "",
"Neurosciences Unit, General Infirmary at Leeds.", "", "", "School of Kang-Ning Junior College of Nursing, Nei-Hu, Taiwan, ROC.",
"College of Nursing, South Dakota State University, USA.", "Department of Geriatric Medicine, University of Manchester, UK.",
"Southern Illinois University, Department of Social Work, Edwardsville 62026-1450, USA.",
"Redlands Community College, El Reno, Oklahoma, USA.", "", "",
"Department of Geriatric Medicine, Alexandra Hospital, Singapore.",
"School of Nursing and Midwifery, Department of Gerontological and Continuing Care Nursing, University of Sheffield, Sheffield, England. [email protected]",
"", "State University of New York, Health Science Center at Syracuse, 13210, USA. [email protected]",
"Div. of Active Palliative Care, Todachuo General Hospital.",
"Children and Young People's Kidney Unit, Nottingham City Hospital, NHS Trust, UK.",
"School of Nursing & Midwifery, Department of Gerontological & Continuing Care Nursing, University of Sheffield. [email protected]",
"Harrington Memorial Hospital, Southbridge, MA, USA.", "", "Department of Curriculum and Instruction, Iowa State University, Ames, 50011. USA.",
"Children & Young People's Kidney Unit, Nottingham City Hospital, U.K.",
"School of Social Work, Boston University, MA 02215, USA. [email protected]",
"Royal Free Hospital, London, UK.", "Humboldt State University, Department of Nursing, Arcata, CA, USA.",
"Department of Psychiatry, The University of Queensland, Mental Health Centre, Royal Brisbane Hospital, Herston, Australia. [email protected]",
"Centre for Evidence Based Nursing, University of York, Heslington, York, Nth Yorkshire, UK, YO1 5DG. [email protected]",
"School of Nursing, University of British Columbia, Vancouver. [email protected]",
"Medisinsk avdeling, Lovisenberg Diakonale Sykehus, Oslo.", "School of Nursing, Yale University, USA.",
"Centre de la Mémoire, Hôpital Roger Salengro, Centre Hospitalier Universitaire, Lille.",
"University of Ulster and Eastern Health and Social Services Board, Ulster, Northern Ireland. [email protected]",
"Thames Valley Family Practice Research Unit, Department of Family Medicine's Centre for Studies in Family Medicine, University of Western Ontario (UWO), London. [email protected]",
"", "", "Department of Special Education, University of Nijmegen, The Netherlands. [email protected]",
"European Institute of Health and Medical Sciences, University of Surrey, Guildford, England.",
"California State University School of Nursing, Chico, USA.")
Within each string may or may not be a substring referring to a location, which itself may refer to country. The intended output is a dataframe as follows:
Affiliation[1], matchedCountry
Affiliation[2], matchedCountry
...
Affiliation[n], matchedCountry
"matchedCountry" is meant to be assessed based on several lists (university, UK cities, US states, etc.) and NA is allowed. And some lists only return ISO codes.
Based on the feedback thus far (thanks @rbm), I have managed a solution (see answer section) that does the job quite well. That said, I am sure performance could still be improved. Thanks.
References:
Upvotes: 1
Views: 743
Reputation: 97
Here's a solution, that checks various lists of substrings against each item in a master list, and then depending on the list, returns either: a) the original substring, b) an adjacent substring, or c) a fixed/pre-defined value. The result is the original table with a "country" column appended.
These conditions are represented in the sample code provided.
edit: it seems the "domain" look-up isnt working as intended. Im not quite sure how to troubleshoot/fix it, but that beyond the scope of this question, I guess...
######### GENERATE COUNTRY ID #############
library("stringr")
library(RCurl)
## Download country lists and perpetrate
countryList <- getURL("https://raw.githubusercontent.com/umpirsky/country-list/master/country/icu/en_US/country.csv")
usstatesList <- getURL("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv")
ukcitiesList <- getURL("https://raw.githubusercontent.com/encyclopediaio/list-of-cities-in-the-uk/master/src/uk_cities.csv")
ukcountryList <- getURL("https://raw.githubusercontent.com/Gibbs/UK-Postcodes/master/postcodes.csv")
universitiesList <- getURL("https://raw.githubusercontent.com/endSly/world-universities-csv/master/world-universities.csv")
countryList <- read.csv(text = countryList, stringsAsFactors=FALSE)
usstatesList <- read.csv(text = usstatesList, stringsAsFactors=FALSE)
ukcitiesList <- read.csv(text = ukcitiesList, stringsAsFactors=FALSE)
ukcountryList <- read.csv(text = ukcountryList, stringsAsFactors=FALSE)
universitiesList <- read.csv(text = universitiesList, header = FALSE, stringsAsFactors=FALSE)
## Generate affiliation list from ronbun data
affiliationList <- pub.data$Affiliation1
## Generate email domains column and add to countryList
domains <- function(x)
{
x <- tolower(x)
x <- paste0(".", x)
return(x)
}
countryList <- data.frame(countryList[c("name", "iso")], domain = domains(countryList$iso), stringsAsFactors = FALSE)
## Add country names to universitiesList as V4
universitiesList <- data.frame(universitiesList, V4="", stringsAsFactors = FALSE)
i = 0
for (v in universitiesList$V1)
{
tryCatch({
i = i + 1
if (sum(str_detect(v, countryList$iso)) > 0) {
universitiesList$V4[i] <- countryList$name[which(str_detect(v, countryList$iso))]
}
}, error=function(e){})
}
### on to the main show
df <- data.frame(affiliationList, CountryISO="", CountryNAME="", stringsAsFactors = FALSE)
i = 0
for (v in affiliationList)
{
tryCatch({
i = i + 1
if (sum(str_detect(v, countryList$name)) > 0) {
df$CountryISO[i] <- countryList$iso[which(str_detect(v, countryList$name))]
df$CountryNAME[i] <- countryList$name[which(str_detect(v, countryList$name))]
}
if (sum(str_detect(v, ukcitiesList$name)) > 0) {
df$CountryISO[i] <- "GB"
df$CountryNAME[i] <- "United Kingdom"
}
if (sum(str_detect(v, ukcountryList$country_string)) > 0) {
df$CountryISO[i] <- "GB"
df$CountryNAME[i] <- "United Kingdom"
}
if (sum(str_detect(v, usstatesList$State)) > 0 || sum(str_detect(v, usstatesList$Abbreviation)) > 0) {
df$CountryISO[i] <- "US"
df$CountryNAME[i] <- "United States"
}
if (sum(str_detect(v, countryList$domain)) > 0) {
df$CountryISO[i] <- countryList$iso[which(str_detect(v, countryList$domain))]
df$CountryNAME[i] <- countryList$name[which(str_detect(v, countryList$domain))]
}
if (sum(str_detect(v, universitiesList$V2)) > 0) {
df$CountryISO[i] <- universitiesList$V1[which(str_detect(v, universitiesList$V2))]
df$CountryNAME[i] <- universitiesList$V1[which(str_detect(v, universitiesList$V4))]
}
}, error=function(e){})
}
return(df)
Thanks for all the help provided!
Upvotes: 0