Data Scraping
### A.P.E.S. Wiki Web Scraper

# The following R code extracts data from the standardized A.P.E.S. Wiki tables
# across all ape range regions, or a selection of them.
# The functions below need to be loaded into R first.

# Install the required packages (only needed once)
install.packages("rvest")
install.packages("dplyr")
install.packages("httr")

library(rvest)
library(dplyr)
library(httr)

root_url <- "https://wiki.iucnapesportal.org"
main_page <- read_html(root_url)

# 1. Functions to get URLs

# Collect the URLs of all site pages in the selected regions
get_all_site_urls <- function(root_url, region_sel){
  all_urls <- vector()
  regions <- get_region_urls(main_page)
  # Extract the region name from each relative URL (e.g. "/index.php/East_Africa")
  region_names <- matrix(unlist(strsplit(regions, split = "/")), ncol = 3, byrow = TRUE)[, 3]
  if(sum(region_sel %in% "all") > 0){
    print("All regions are selected.")
  } else {
    regions <- regions[region_names %in% region_sel]
    print(paste(region_sel, " is selected.", sep = ""))
  }
  for(region in regions){
    country_urls <- get_country_urls(root_url, region)
    for(country in country_urls){
      site_urls <- get_site_urls(root_url, country)
      all_urls <- c(all_urls, site_urls)
    }
  }
  closeAllConnections()
  all_urls
}

# Get the relative URLs of the region pages from the wiki's main page
get_region_urls <- function(main_page){
  region_links <- main_page %>% html_nodes('.body')
  region_links[7] %>% html_nodes('a') %>% html_attr('href')
}

# Get the relative URLs of the country pages listed on a region page
get_country_urls <- function(root_url, region_url){
  region_page <- tryCatch(content(GET(paste(root_url, region_url, sep = ""))),
                          error = function(e) FALSE)
  if(length(region_page) > 1){
    region_page %>% html_nodes('.mw-parser-output li a') %>% html_attr('href')
  }
}

# Get the relative URLs of the site pages listed on a country page
get_site_urls <- function(root_url, country_url){
  country_page <- tryCatch(content(GET(paste(root_url, country_url, sep = ""))),
                           error = function(e) FALSE)
  if(length(country_page) > 1){
    country_page %>% html_nodes('.mw-parser-output li a') %>% html_attr('href')
  }
}

# 2. Get location data

# Read the region, country and site name from the links at the top of a site page
get_region_country_and_site <- function(site_page){
  location_data <- site_page %>%
    html_nodes('.mw-parser-output p a') %>%
    html_text()
  location_data[1:3]
}

# 3. Basic site information

# Read the values (second column) of the first four rows of the basic information table
get_site_characteristics <- function(site_page){
  basic_information_table <- site_page %>%
    html_nodes('.basic-information') %>%
    html_table(fill = FALSE)
  if(length(basic_information_table) > 0){
    basic_information_table[[1]][1:4, 2]
  }
}

# 4. Switch columns

# Move the last three columns (Region, Country, Site) to the front of a table
switch_columns <- function(data_table){
  data_table[c((ncol(data_table) - 2):ncol(data_table), 1:(ncol(data_table) - 3))]
}

# Extract a table identified by a CSS selector from a site page and append the
# location information; 'ncolx' limits the number of columns read (0 = all columns)
get_table <- function(site_page, selector, main_table, location_data, ncolx = 0){
  table_data <- site_page %>% html_nodes(selector) %>% html_table(fill = FALSE)
  if(length(table_data) > 0){
    if(ncolx != 0){
      table_data <- table_data[[1]][, 1:ncolx]
    } else {
      table_data <- table_data[[1]]
    }
    table_data <- table_data %>% mutate_all(as.character)
    table_data <- as.data.frame(table_data)
    colnames(table_data) <- colnames(main_table)[1:(ncol(main_table) - 3)]
    table_data$Region <- location_data[1]
    table_data$Country <- location_data[2]
    table_data$Site <- location_data[3]
    table_data
  } else {
    print(paste(location_data[3], selector, "table not added"))
    NA
  }
}
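# Illustration of switch_columns() (defined above) on a hypothetical one-row
# table; the data here are made up for demonstration and not taken from the wiki:
example_df <- data.frame(Challenge = "Lack of funding", Source = "Example source",
                         Region = "East Africa", Country = "Uganda", Site = "Example Site",
                         stringsAsFactors = FALSE)
switch_columns(example_df)
# Returns the columns reordered as: Region, Country, Site, Challenge, Source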
# 5. Main function to get data (uses the functions above)
get_all_site_tables <- function(root_url, region_sel, tables_sel){
  all_site_urls <- sort(get_all_site_urls(root_url, region_sel))
  # Keep only links to content pages
  all_site_urls <- all_site_urls[grepl("index.php/", all_site_urls, fixed = TRUE)]
  # Log table recording, for each link, what kind of page it points to
  logtable <- data.frame(link = paste(root_url, all_site_urls, sep = ""),
                         sitetype = rep("", length(all_site_urls)))
  # Initialize an empty data frame for each selected table
  if(sum(tables_sel %in% "site_characteristics_table") > 0){
    site_characteristics_table <- data.frame(
      Region = character(), Country = character(), Site = character(),
      Area = character(), Coordinates = character(), Designation = character(),
      'Habitat types' = character(),
      check.names = FALSE, stringsAsFactors = FALSE)
  }
  if(sum(tables_sel %in% "ape_status_table") > 0){
    ape_status_table <- data.frame(
      Species = character(), Year = character(),
      'Abundance estimate (95% CI)' = character(),
      'Density estimate [ind./ km²] (95% CI)' = character(),
      'Encounter rate (nests/km)' = character(),
      Area = character(), Method = character(), Source = character(),
      Comments = character(), 'A.P.E.S. database ID' = character(),
      Region = character(), Country = character(), Site = character(),
      check.names = FALSE, stringsAsFactors = FALSE)
  }
  if(sum(tables_sel %in% "threats_table") > 0){
    threats_table <- data.frame(
      Category = character(), 'Specific threats' = character(),
      'Threat level' = character(), 'Quantified severity' = character(),
      Description = character(), 'Year of threat' = character(),
      Region = character(), Country = character(), Site = character(),
      check.names = FALSE, stringsAsFactors = FALSE)
  }
  if(sum(tables_sel %in% "conservation_activities_table") > 0){
    conservation_activities_table <- data.frame(
      Category = character(), 'Specific activity' = character(),
      Description = character(), 'Year of activity' = character(),
      Region = character(), Country = character(), Site = character(),
      check.names = FALSE, stringsAsFactors = FALSE)
  }
  if(sum(tables_sel %in% "challenges_table") > 0){
    challenges_table <- data.frame(
      Challenge = character(), Source = character(),
      Region = character(), Country = character(), Site = character(),
      check.names = FALSE, stringsAsFactors = FALSE)
  }
  if(sum(tables_sel %in% "behaviours_table") > 0){
    behaviours_table <- data.frame(
      Behavior = character(), Source = character(),
      Region = character(), Country = character(), Site = character(),
      check.names = FALSE, stringsAsFactors = FALSE)
  }
  # Visit each site page and extract the selected tables
  for(i in 1:length(all_site_urls)){
    site_page <- tryCatch(content(GET(paste(root_url, all_site_urls[i], sep = ""))),
                          error = function(e) FALSE)
    if(length(site_page) < 2){
      logtable$sitetype[i] <- "no page"
      print(paste(i, "of", length(all_site_urls), "no page", all_site_urls[i]))
    } else {
      location_data <- get_region_country_and_site(site_page)
      if(is.na(location_data[1])){
        logtable$sitetype[i] <- "link to construction page"
        print(paste(i, "of", length(all_site_urls), "link to construction page", all_site_urls[i]))
      } else if(location_data[1] == "Region"){
        logtable$sitetype[i] <- "link to empty content page"
        print(paste(i, "of", length(all_site_urls), "link to empty content page", all_site_urls[i]))
      } else {
        logtable$sitetype[i] <- "link to filled content page"
        print(paste(i, "of", length(all_site_urls), "link to filled content page", all_site_urls[i]))
        site_characteristics <- unlist(get_site_characteristics(site_page))
        if(length(site_characteristics) > 1){
          if(sum(tables_sel %in% "site_characteristics_table") > 0){
            site_characteristics_table[nrow(site_characteristics_table) + 1, ] <-
              c(location_data, site_characteristics)
          }
        } else {
          print(paste(location_data[3], "basic information table not added"))
        }
        if(sum(tables_sel %in% "ape_status_table") > 0){
          xx <- get_table(site_page, '.population-estimate-table', ape_status_table,
                          location_data, ncolx = 10)
          if(is.data.frame(xx)){ ape_status_table <- rbind(ape_status_table, xx) }
        }
        if(sum(tables_sel %in% "threats_table") > 0){
          xx <- get_table(site_page, '.threats-table', threats_table,
                          location_data, ncolx = 6)
          if(is.data.frame(xx)){ threats_table <- rbind(threats_table, xx) }
        }
        if(sum(tables_sel %in% "conservation_activities_table") > 0){
          xx <- get_table(site_page, '.conservation-actions-table', conservation_activities_table,
                          location_data, ncolx = 0)
          if(is.data.frame(xx)){ conservation_activities_table <- rbind(conservation_activities_table, xx) }
        }
        if(sum(tables_sel %in% "behaviours_table") > 0){
          xx <- get_table(site_page, '.behaviors-table', behaviours_table,
                          location_data, ncolx = 0)
          if(is.data.frame(xx)){ behaviours_table <- rbind(behaviours_table, xx) }
        }
        if(sum(tables_sel %in% "challenges_table") > 0){
          xx <- get_table(site_page, '.challenges-table', challenges_table,
                          location_data, ncolx = 0)
          if(is.data.frame(xx)){ challenges_table <- rbind(challenges_table, xx) }
        }
      }
    }
    closeAllConnections()
  }
  # Move the Region, Country and Site columns to the front of each extracted table.
  # This is done once, after the loop, so that rbind() inside the loop always sees
  # the original column order (site_characteristics_table already has them in front).
  for(tab in setdiff(tables_sel, "site_characteristics_table")){
    assign(tab, switch_columns(get(tab)))
  }
  # Return the log table and the selected tables as a named list
  tablesreturn <- c("logtable", tables_sel)
  xx <- lapply(tablesreturn, function(x){ get(x) })
  names(xx) <- tablesreturn
  return(xx)
}
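# Optional sanity check before a full run: confirm that the wiki is reachable and
# that the URL helpers return links. This is a minimal sketch, not part of the
# original workflow; it only inspects the first region found on the main page.
region_urls <- get_region_urls(main_page)
print(region_urls)                                        # one relative URL per region
print(head(get_country_urls(root_url, region_urls[1])))   # country links in the first region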
# After loading the functions, select the regions and tables that you want to
# extract data from.

# 1. REGIONS
# Please define 'region_sel'; it can take the following values:
# "all" = all regions will be read,
# or one or a combination of: "West_Africa", "Central_Africa", "East_Africa", "Asia"
# E.g., region_sel = c("East_Africa", "Asia")

# 2. TABLES
# Please define 'tables_sel' as a combination of the following options:
# "site_characteristics_table", "ape_status_table", "threats_table",
# "conservation_activities_table", "challenges_table", "behaviours_table"

# For example, to extract all tables from all regions:
region_sel <- "all"
tables_sel <- c("site_characteristics_table", "threats_table", "conservation_activities_table",
                "behaviours_table", "challenges_table", "ape_status_table")

# Creates "all_tables", a list with one entry for each selected table
# (plus "logtable", which records the type of page each link pointed to)
all_tables <- get_all_site_tables(root_url, region_sel, tables_sel)

# The output is a list of data frames, corresponding to the tables
str(all_tables)
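# The extracted tables can then be saved for later use, e.g. as CSV files.
# This is a sketch; the file names are arbitrary, and UTF-8 encoding is used
# because some column headers contain non-ASCII characters (e.g. "km²").
for(table_name in names(all_tables)){
  write.csv(all_tables[[table_name]],
            file = paste(table_name, ".csv", sep = ""),
            row.names = FALSE, fileEncoding = "UTF-8")
}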