Copy and paste the following code into R to extract data from the A.P.E.S. Wiki. Comments within the script explain how to extract data for specific regions and tables.
<pre>
### A.P.E.S. Wiki Web Scraper
# The following R code extracts data from the standardized A.P.E.S. Wiki tables
# across all, or a selection of, ape range regions.
# The functions below must first be loaded into R.

# Install the required packages (only needed once)
install.packages("rvest")
install.packages("dplyr")
install.packages("httr")

library(rvest)
library(dplyr)
library(httr)
root_url <- "https://wiki.iucnapesportal.org"
main_page <- read_html(root_url)
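# 'main_page' is fetched once here and reused by get_region_urls() below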
# 1. Functions to get urls
get_all_site_urls <- function(root_url, region_sel){
  all_urls <- vector()
  # Note: uses the global 'main_page' object created above
  regions <- get_region_urls(main_page)
  # The region name is the last element of each link, e.g. "/index.php/Asia"
  region_names <- matrix(unlist(strsplit(regions, split = "/")), ncol = 3, byrow = TRUE)[, 3]
  if("all" %in% region_sel){
    print("All regions are selected.")
  }else{
    regions <- regions[region_names %in% region_sel]
    print(paste(region_sel, " is selected.", sep = ""))
  }
  for(region in regions){
    country_urls <- get_country_urls(root_url, region)
    for(country in country_urls){
      site_urls <- get_site_urls(root_url, country)
      all_urls <- c(all_urls, site_urls)
    }
  }
  closeAllConnections()
  all_urls
}
get_region_urls <- function(main_page){
  region_links <- main_page %>%
    html_nodes('.body')
  # The region links sit in the 7th '.body' node of the main page
  region_links[7] %>%
    html_nodes('a') %>%
    html_attr('href')
}
get_country_urls <- function(root_url, region_url){
  region_page <- tryCatch(content(GET(paste(root_url, region_url, sep = ""))), error = function(e) FALSE)
  if(length(region_page) > 1){
    region_page %>%
      html_nodes('.mw-parser-output li a') %>%
      html_attr('href')
  }
}
get_site_urls <- function(root_url, country_url){
  country_page <- tryCatch(content(GET(paste(root_url, country_url, sep = ""))), error = function(e) FALSE)
  if(length(country_page) > 1){
    country_page %>%
      html_nodes('.mw-parser-output li a') %>%
      html_attr('href')
  }
}
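# Example (hypothetical): get_all_site_urls(root_url, "Asia") returns relative
# site links of the form "/index.php/<Site_name>" for every site page in Asia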
# 2. Get location data
get_region_country_and_site <- function(site_page){
  location_data <- site_page %>%
    html_nodes('.mw-parser-output p a') %>%
    html_text()
  # The first three links on a site page are the region, country and site name
  location_data[1:3]
}
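# Example (hypothetical values): location_data could be
# c("Asia", "Indonesia", "Some site name")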
# 3. Basic site information
get_site_characteristics <- function(site_page){
  basic_information_table <- site_page %>%
    html_nodes('.basic-information') %>%
    html_table(fill = FALSE)
  if(length(basic_information_table) > 0){
    # Area, coordinates, designation and habitat types (rows 1-4, column 2)
    basic_information_table[[1]][1:4, 2]
  }
}
# 4. Switch columns
# Moves the last three columns (Region, Country, Site) to the front
switch_columns <- function(data_table){
  data_table[c((ncol(data_table) - 2):ncol(data_table), 1:(ncol(data_table) - 3))]
}
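# Illustration: switch_columns(data.frame(A = 1, B = 2, Region = "r", Country = "c", Site = "s"))
# returns the columns in the order Region, Country, Site, A, B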
# Extracts the wiki table matching the CSS 'selector' and appends the location
# columns; 'ncolx' limits the table to its first ncolx columns (0 = keep all)
get_table <- function(site_page, selector, main_table, location_data, ncolx = 0){
  table_data <- site_page %>%
    html_nodes(selector) %>%
    html_table(fill = FALSE)
  if(length(table_data) > 0){
    if(ncolx != 0){
      table_data <- table_data[[1]][, 1:ncolx]
    }else{
      table_data <- table_data[[1]]
    }
    table_data <- table_data %>%
      mutate_all(as.character)
    table_data <- as.data.frame(table_data)
    colnames(table_data) <- colnames(main_table)[1:(ncol(main_table) - 3)]
    table_data$Region <- location_data[1]
    table_data$Country <- location_data[2]
    table_data$Site <- location_data[3]
    table_data
  } else {
    print(paste(location_data[3], selector, "table not added"))
    NA
  }
}
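# Example call (keeps the first 6 columns of the threats table on the page):
# get_table(site_page, '.threats-table', threats_table, location_data, ncolx = 6)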
# 5. Main function to get data (uses the functions above)
get_all_site_tables <- function(root_url, region_sel, tables_sel){
  all_site_urls <- sort(get_all_site_urls(root_url, region_sel))
  # Keep only links to proper wiki pages
  all_site_urls <- all_site_urls[grepl("index.php/", all_site_urls)]
  logtable <- data.frame(link = paste(root_url, all_site_urls, sep = ""),
                         sitetype = rep("", length(all_site_urls)),
                         stringsAsFactors = FALSE)
  if("site_characteristics_table" %in% tables_sel){
    site_characteristics_table <- data.frame(Region = character(),
                                             Country = character(),
                                             Site = character(),
                                             Area = character(),
                                             Coordinates = character(),
                                             Designation = character(),
                                             'Habitat types' = character(),
                                             check.names = FALSE,
                                             stringsAsFactors = FALSE)
  }
  if("ape_status_table" %in% tables_sel){
    ape_status_table <- data.frame(Species = character(),
                                   Year = character(),
                                   'Abundance estimate (95% CI)' = character(),
                                   'Density estimate [ind./ km²] (95% CI)' = character(),
                                   'Encounter rate (nests/km)' = character(),
                                   Area = character(),
                                   Method = character(),
                                   Source = character(),
                                   Comments = character(),
                                   'A.P.E.S. database ID' = character(),
                                   Region = character(),
                                   Country = character(),
                                   Site = character(),
                                   check.names = FALSE,
                                   stringsAsFactors = FALSE)
  }
  if("threats_table" %in% tables_sel){
    threats_table <- data.frame(Category = character(),
                                'Specific threats' = character(),
                                'Threat level' = character(),
                                'Quantified severity' = character(),
                                Description = character(),
                                'Year of threat' = character(),
                                Region = character(),
                                Country = character(),
                                Site = character(),
                                check.names = FALSE,
                                stringsAsFactors = FALSE)
  }
  if("conservation_activities_table" %in% tables_sel){
    conservation_activities_table <- data.frame(Category = character(),
                                                'Specific activity' = character(),
                                                Description = character(),
                                                'Year of activity' = character(),
                                                Region = character(),
                                                Country = character(),
                                                Site = character(),
                                                check.names = FALSE,
                                                stringsAsFactors = FALSE)
  }
  if("challenges_table" %in% tables_sel){
    challenges_table <- data.frame(Challenge = character(),
                                   Source = character(),
                                   Region = character(),
                                   Country = character(),
                                   Site = character(),
                                   check.names = FALSE,
                                   stringsAsFactors = FALSE)
  }
  if("behaviours_table" %in% tables_sel){
    behaviours_table <- data.frame(Behavior = character(),
                                   Source = character(),
                                   Region = character(),
                                   Country = character(),
                                   Site = character(),
                                   check.names = FALSE,
                                   stringsAsFactors = FALSE)
  }
  for(i in 1:length(all_site_urls)){
    site_page <- tryCatch(content(GET(paste(root_url, all_site_urls[i], sep = ""))), error = function(e) FALSE)
    if(length(site_page) < 2){
      logtable$sitetype[i] <- "no page"
      print(paste(i, "of", length(all_site_urls), " ", "no page", all_site_urls[i], sep = " "))
    }else{
      location_data <- get_region_country_and_site(site_page)
      if(is.na(location_data[1])){
        logtable$sitetype[i] <- "link to construction page"
        print(paste(i, "of", length(all_site_urls), " ", "link to construction page", all_site_urls[i], sep = " "))
      }else if(location_data[1] == "Region"){
        logtable$sitetype[i] <- "link to empty content page"
        print(paste(i, "of", length(all_site_urls), " ", "link to empty content page", all_site_urls[i], sep = " "))
      }else{
        logtable$sitetype[i] <- "link to filled content page"
        print(paste(i, "of", length(all_site_urls), " ", "link to filled content page", all_site_urls[i], sep = " "))
        site_characteristics <- unlist(get_site_characteristics(site_page))
        if(length(site_characteristics) > 1){
          if("site_characteristics_table" %in% tables_sel){
            site_characteristics_table[nrow(site_characteristics_table) + 1, ] <- c(location_data, site_characteristics)
          }
        } else {
          print(paste(location_data[3], "basic information table not added"))
        }
        if("ape_status_table" %in% tables_sel){
          xx <- get_table(site_page, '.population-estimate-table', ape_status_table, location_data, ncolx = 10)
          if(is.data.frame(xx)){ ape_status_table <- rbind(ape_status_table, xx) }
        }
        if("threats_table" %in% tables_sel){
          xx <- get_table(site_page, '.threats-table', threats_table, location_data, ncolx = 6)
          if(is.data.frame(xx)){ threats_table <- rbind(threats_table, xx) }
        }
        if("conservation_activities_table" %in% tables_sel){
          xx <- get_table(site_page, '.conservation-actions-table', conservation_activities_table, location_data, ncolx = 0)
          if(is.data.frame(xx)){ conservation_activities_table <- rbind(conservation_activities_table, xx) }
        }
        if("behaviours_table" %in% tables_sel){
          xx <- get_table(site_page, '.behaviors-table', behaviours_table, location_data, ncolx = 0)
          if(is.data.frame(xx)){ behaviours_table <- rbind(behaviours_table, xx) }
        }
        if("challenges_table" %in% tables_sel){
          xx <- get_table(site_page, '.challenges-table', challenges_table, location_data, ncolx = 0)
          if(is.data.frame(xx)){ challenges_table <- rbind(challenges_table, xx) }
        }
      }
    }
    closeAllConnections()
  }
  # Move the Region/Country/Site columns to the front of each extracted table;
  # the site characteristics table is built with them in front already
  for(tab in setdiff(tables_sel, "site_characteristics_table")){
    assign(tab, switch_columns(get(tab)))
  }
  tablesreturn <- c("logtable", tables_sel)
  out <- lapply(tablesreturn, function(x){ get(x) })
  names(out) <- tablesreturn
  return(out)
}
# After loading the functions, select the regions and tables to extract data from.
# 1. REGIONS
# Please define 'region_sel'; it can take the following values:
# "all" = all regions are read
# or one or a combination of: "West_Africa", "Central_Africa", "East_Africa", "Asia"
# E.g., region_sel <- c("East_Africa", "Asia")
# 2. TABLES
# Please define 'tables_sel' as one or a combination of:
# "site_characteristics_table", "ape_status_table", "threats_table",
# "conservation_activities_table", "behaviours_table", "challenges_table"
# E.g.:
region_sel <- "all"
tables_sel <- c("site_characteristics_table", "threats_table", "conservation_activities_table",
                "behaviours_table", "challenges_table", "ape_status_table")
# Creates "all_tables", a list with one entry for each selected table plus a log table
all_tables <- get_all_site_tables(root_url, region_sel, tables_sel)
# The output is a list of data frames corresponding to the selected tables
str(all_tables)
</pre>
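To keep the extracted data for later use, the tables in the returned list can be written to disk. The following is a minimal sketch (the output directory name is a placeholder) that saves each data frame in "all_tables" as a CSV file:
<pre>
# Minimal sketch: save each extracted table as a CSV file.
# "apes_wiki_tables" is a placeholder output directory; adjust as needed.
dir.create("apes_wiki_tables", showWarnings = FALSE)
for(table_name in names(all_tables)){
  write.csv(all_tables[[table_name]],
            file = file.path("apes_wiki_tables", paste(table_name, ".csv", sep = "")),
            row.names = FALSE)
}
</pre>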