Difference between revisions of "Data Scraping"

From A.P.E.S. wiki
Jump to navigation Jump to search
Line 1: Line 1:
  
 
+
<nowiki>
 
  A.P.E.S. Wiki Web Scraper  
 
  A.P.E.S. Wiki Web Scraper  
  

Revision as of 07:25, 2 August 2023

<nowiki>

A.P.E.S. Wiki Web Scraper 
# The following R code extracts data from the standardized A.P.E.S. Wiki tables across all, or a selection of, ape range regions.
# The functions below first need to be loaded into R.

# Install the required packages (only needed once per R installation),
# then attach them. Each statement must be on its own line: R does not
# accept several calls on one line without a `;` separator.
install.packages("rvest")
install.packages("dplyr")
install.packages("httr")
library(rvest)
library(dplyr)
library(httr)

# Base address of the wiki and its parsed front page. `main_page` is read
# as a global by get_all_site_urls().
root_url <- "https://wiki.iucnapesportal.org"
main_page <- read_html(root_url)

# 1. Functions to get urls

# Collect the hrefs of all site pages for the selected regions.
#
# Arguments:
#   root_url    Base URL of the wiki; prepended to every relative href.
#   region_sel  Character vector. Either contains "all" (every region is
#               scanned) or holds region page names, i.e. the last path
#               component of the region links (e.g. "East_Africa").
#
# Returns a character vector with every href found on the country pages of
# the selected regions (not filtered; the caller decides which ones are
# site pages).
#
# Notes:
#   * Reads the parsed front page from the global variable `main_page`.
#   * Calls closeAllConnections() before returning, which closes every
#     open user connection in the session, not only those opened here.
get_all_site_urls <- function(root_url, region_sel) {
  regions <- get_region_urls(main_page)
  # The region name is the last path component of its href. basename()
  # does not depend on every href having exactly three "/"-separated parts.
  region_names <- basename(regions)

  if ("all" %in% region_sel) {
    print("All regions are selected.")
  } else {
    regions <- regions[region_names %in% region_sel]
    print(paste(region_sel, " is selected.", sep = ""))
  }

  all_urls <- character()
  for (region in regions) {
    country_urls <- get_country_urls(root_url, region)
    for (country in country_urls) {
      site_urls <- get_site_urls(root_url, country)
      all_urls <- c(all_urls, site_urls)
    }
  }
  closeAllConnections()
  all_urls
}

# Extract the hrefs of the region links from the parsed wiki front page.
#
# Arguments:
#   main_page  Parsed HTML document of the wiki front page.
#
# Returns the `href` attribute of every <a> element inside the 7th node
# matching the CSS class `.body`. The position 7 is hard-coded, so a change
# in the front-page layout will change what this returns.
get_region_urls <- function(main_page) {
  body_nodes <- html_nodes(main_page, ".body")
  region_anchors <- html_nodes(body_nodes[7], "a")
  html_attr(region_anchors, "href")
}

# Fetch a region page and return the hrefs of the links in its lists.
#
# Arguments:
#   root_url    Base URL of the wiki.
#   region_url  Href of the region page, appended to `root_url`.
#
# Returns the `href` attributes of all `.mw-parser-output li a` elements,
# or (invisibly) NULL when the request/parsing raised an error or the
# parsed result has length <= 1.
get_country_urls <- function(root_url, region_url) {
  page_address <- paste0(root_url, region_url)
  region_page <- tryCatch(
    content(GET(page_address)),
    error = function(e) FALSE
  )

  # A failed fetch leaves the length-1 sentinel FALSE; nothing to extract.
  if (length(region_page) <= 1) {
    return(invisible(NULL))
  }

  link_nodes <- html_nodes(region_page, ".mw-parser-output li a")
  html_attr(link_nodes, "href")
}

# Fetch a country page and return the hrefs of the links in its lists.
#
# Arguments:
#   root_url     Base URL of the wiki.
#   country_url  Href of the country page, appended to `root_url`.
#
# Returns the `href` attributes of all `.mw-parser-output li a` elements,
# or (invisibly) NULL when the request/parsing raised an error or the
# parsed result has length <= 1.
get_site_urls <- function(root_url, country_url) {
  page_address <- paste0(root_url, country_url)
  country_page <- tryCatch(
    content(GET(page_address)),
    error = function(e) FALSE
  )

  # A failed fetch leaves the length-1 sentinel FALSE; nothing to extract.
  if (length(country_page) <= 1) {
    return(invisible(NULL))
  }

  link_nodes <- html_nodes(country_page, ".mw-parser-output li a")
  html_attr(link_nodes, "href")
}

# 2. Get location data

# Read the location of a site from its wiki page.
#
# Arguments:
#   site_page  Parsed HTML document of a site page.
#
# Returns a character vector of length 3: the text of the first three links
# found inside paragraphs of `.mw-parser-output`. The caller treats them as
# region, country and site, in that order. Missing links yield NA entries.
get_region_country_and_site <- function(site_page) {
  paragraph_links <- html_nodes(site_page, ".mw-parser-output p a")
  link_text <- html_text(paragraph_links)
  # Indexing with 1:3 (rather than head()) pads with NA when fewer than
  # three links exist; the caller relies on that NA to detect such pages.
  link_text[1:3]
}

# 3. Basic site information

# Read the "basic information" table of a site page.
#
# Arguments:
#   site_page  Parsed HTML document of a site page.
#
# Returns rows 1 to 4 of the second column of the first table carrying the
# CSS class `.basic-information`, or (invisibly) NULL when the page has no
# such table. Rows beyond the end of a shorter table come back as NA.
get_site_characteristics <- function(site_page) {
  basic_information_table <- site_page %>%
    html_nodes(".basic-information") %>%
    html_table(fill = FALSE)

  if (length(basic_information_table) > 0) {
    # html_table() returns a list of tables; take the first one. The
    # published listing had lost the `[[1]]` and referred to an undefined
    # object instead.
    basic_information_table[[1]][1:4, 2]
  }
}

# 4. Switch columns

# Move the last three columns of a data frame to the front.
#
# Arguments:
#   data_table  A data frame.
#
# Returns `data_table` with its last three columns first, followed by the
# remaining columns in their original order. A table with three or fewer
# columns is returned unchanged.
switch_columns <- function(data_table) {
  n_cols <- ncol(data_table)
  # With n_cols == 3 the expression 1:(n_cols - 3) is c(1, 0) and would
  # duplicate the first column; with fewer columns the index mixes
  # positive and negative values and errors. Nothing to reorder then.
  if (n_cols <= 3) {
    return(data_table)
  }
  data_table[c((n_cols - 2):n_cols, seq_len(n_cols - 3))]
}

# Extract one standardized table from a site page.
#
# Arguments:
#   site_page      Parsed HTML document of a site page.
#   selector       CSS selector of the table (e.g. ".threats-table").
#   main_table     Data frame whose column names define the target layout;
#                  its last three columns must be Region, Country and Site.
#   location_data  Character vector c(region, country, site) of the page.
#   ncolx          Number of leading columns to keep from the scraped
#                  table; 0 (the default) keeps all columns.
#
# Returns a data frame of character columns named like `main_table`
# (scraped columns followed by Region, Country, Site), or NA when the page
# has no matching table or its column count does not fit `main_table`.
# Only the first table matching `selector` is used.
get_table <- function(site_page, selector, main_table, location_data, ncolx = 0) {
  tables_found <- site_page %>%
    html_nodes(selector) %>%
    html_table(fill = FALSE)

  if (length(tables_found) == 0) {
    print(paste(location_data[3], selector, "table not added"))
    return(NA)
  }

  # html_table() returns a list of tables; take the first one. The
  # published listing had lost the `[[1]]` and referred to an undefined
  # object instead.
  table_data <- as.data.frame(tables_found[[1]])

  if (ncolx != 0 && ncol(table_data) >= ncolx) {
    # drop = FALSE keeps a data frame even when a single column is kept.
    table_data <- table_data[, seq_len(ncolx), drop = FALSE]
  }

  target_names <- colnames(main_table)[seq_len(ncol(main_table) - 3)]
  if (ncol(table_data) != length(target_names)) {
    # A table that does not fit the standard layout used to abort the
    # whole scrape further down (in the renaming or in rbind()); skip it.
    print(paste(
      location_data[3], selector,
      "table not added: unexpected number of columns"
    ))
    return(NA)
  }

  # Store every value as text so that tables from different pages can be
  # combined without type conflicts.
  table_data[] <- lapply(table_data, as.character)
  colnames(table_data) <- target_names
  table_data$Region <- location_data[1]
  table_data$Country <- location_data[2]
  table_data$Site <- location_data[3]
  table_data
}

# 5. Main function to get data (uses the functions above)

# Build an empty (zero-row) data frame of character columns.
# Column names are used verbatim (no syntactic-name mangling).
.empty_character_table <- function(col_names) {
  columns <- setNames(rep(list(character()), length(col_names)), col_names)
  do.call(
    data.frame,
    c(columns, list(check.names = FALSE, stringsAsFactors = FALSE))
  )
}

# Scrape the selected standardized tables from every site page.
#
# Arguments:
#   root_url    Base URL of the wiki.
#   region_sel  Regions to scan; passed on to get_all_site_urls().
#   tables_sel  Character vector of table names to return. Known names:
#               "site_characteristics_table", "ape_status_table",
#               "threats_table", "conservation_activities_table",
#               "challenges_table", "behaviours_table".
#
# Returns a named list: "logtable" (one row per visited link with the kind
# of page found) followed by one data frame per entry of `tables_sel`, in
# that order. All table columns are character. Scraped tables end with the
# columns Region, Country, Site; the site characteristics table starts
# with them.
#
# Stops with an error, before any page is fetched, if `tables_sel` holds an
# unknown table name.
#
# Side effects: prints one progress line per link and calls
# closeAllConnections() after each page, which closes every open user
# connection in the session.
get_all_site_tables <- function(root_url, region_sel, tables_sel) {
  location_cols <- c("Region", "Country", "Site")

  # Column layout of every table this function can return.
  table_columns <- list(
    site_characteristics_table = c(
      location_cols, "Area", "Coordinates", "Designation", "Habitat types"
    ),
    ape_status_table = c(
      "Species", "Year", "Abundance estimate (95% CI)",
      "Density estimate [ind./ km²] (95% CI)", "Encounter rate (nests/km)",
      "Area", "Method", "Source", "Comments", "A.P.E.S. database ID",
      location_cols
    ),
    threats_table = c(
      "Category", "Specific threats", "Threat level", "Quantified severity",
      "Description", "Year of threat", location_cols
    ),
    conservation_activities_table = c(
      "Category", "Specific activity", "Description", "Year of activity",
      location_cols
    ),
    challenges_table = c("Challenge", "Source", location_cols),
    behaviours_table = c("Behavior", "Source", location_cols)
  )

  # Tables read through get_table(), in processing order: CSS selector and
  # number of leading columns to keep (0 = keep all).
  page_tables <- list(
    ape_status_table = list(
      selector = ".population-estimate-table", ncolx = 10
    ),
    threats_table = list(selector = ".threats-table", ncolx = 6),
    conservation_activities_table = list(
      selector = ".conservation-actions-table", ncolx = 0
    ),
    behaviours_table = list(selector = ".behaviors-table", ncolx = 0),
    challenges_table = list(selector = ".challenges-table", ncolx = 0)
  )

  unknown_tables <- setdiff(tables_sel, names(table_columns))
  if (length(unknown_tables) > 0) {
    stop(
      "Unknown table(s) in 'tables_sel': ",
      paste(unknown_tables, collapse = ", "),
      call. = FALSE
    )
  }

  all_site_urls <- sort(get_all_site_urls(root_url, region_sel))
  # Keep only hrefs that contain "index.php/".
  all_site_urls <- all_site_urls[grepl("index.php/", all_site_urls)]
  n_sites <- length(all_site_urls)

  # paste0() would return one element for zero URLs, hence the guard.
  # stringsAsFactors = FALSE keeps `sitetype` writable on R < 4.0.
  logtable <- data.frame(
    link = if (n_sites > 0) paste0(root_url, all_site_urls) else character(),
    sitetype = rep("", n_sites),
    stringsAsFactors = FALSE
  )

  templates <- lapply(
    table_columns[unique(tables_sel)], .empty_character_table
  )
  want_characteristics <- "site_characteristics_table" %in% tables_sel
  site_characteristics_table <- templates[["site_characteristics_table"]]
  selected_page_tables <- intersect(names(page_tables), tables_sel)
  # Per-table list of scraped data frames, combined once after the loop.
  scraped_rows <- setNames(
    vector("list", length(selected_page_tables)), selected_page_tables
  )

  # seq_along() makes the loop a no-op when no site URL was found.
  for (i in seq_along(all_site_urls)) {
    site_page <- tryCatch(
      content(GET(paste0(root_url, all_site_urls[i]))),
      error = function(e) FALSE
    )

    # Classify the page before extracting anything from it.
    location_data <- NULL
    if (length(site_page) < 2) {
      status <- "no page"
    } else {
      location_data <- get_region_country_and_site(site_page)
      if (is.na(location_data[1])) {
        status <- "link to construction page"
      } else if (location_data[1] == "Region") {
        status <- "link to empty content page"
      } else {
        status <- "link to filled content page"
      }
    }
    logtable$sitetype[i] <- status
    print(paste(i, "of", n_sites, " ", status, all_site_urls[i], sep = " "))

    if (status == "link to filled content page") {
      site_characteristics <- unlist(get_site_characteristics(site_page))
      if (length(site_characteristics) > 1) {
        if (want_characteristics) {
          new_row <- nrow(site_characteristics_table) + 1
          site_characteristics_table[new_row, ] <-
            c(location_data, site_characteristics)
        }
      } else {
        print(paste(location_data[3], "basic information table not added"))
      }

      for (table_name in selected_page_tables) {
        spec <- page_tables[[table_name]]
        scraped <- get_table(
          site_page, spec$selector, templates[[table_name]], location_data,
          ncolx = spec$ncolx
        )
        # get_table() returns NA when the page has no usable table.
        if (is.data.frame(scraped)) {
          scraped_rows[[table_name]] <-
            c(scraped_rows[[table_name]], list(scraped))
        }
      }
      # NOTE: earlier versions called switch_columns() here and discarded
      # the result, so scraped tables keep Region/Country/Site as their
      # last columns. That column order is preserved.
    }
    closeAllConnections()
  }

  tables <- lapply(selected_page_tables, function(table_name) {
    do.call(
      rbind,
      c(list(templates[[table_name]]), scraped_rows[[table_name]])
    )
  })
  names(tables) <- selected_page_tables
  if (want_characteristics) {
    tables[["site_characteristics_table"]] <- site_characteristics_table
  }

  c(list(logtable = logtable), tables[tables_sel])
}


# After loading the functions, select the regions and tables that you need to extract data from.
# 1. REGIONS
 # Please define 'region_sel'; it could have the following values: 
 # "all" = all regions would be read
 # or one or a combination of: "West_Africa","Central_Africa","East_Africa","Asia" 
 # E.g., region_sel=c("East_Africa", "Asia")
# 2. TABLES
 # Please define 'tables_sel' from the following options: 
 # a combination of: "site_characteristics_table","threats_table","conservation_activities_table","behaviours_table","challenges_table","ape_status_table"
 # e.g.
 

# Selection used for the run: every region and every available table.
region_sel <- "all"
tables_sel <- c(
  "site_characteristics_table", "threats_table",
  "conservation_activities_table", "behaviours_table",
  "challenges_table", "ape_status_table"
)

# Creates "all_tables", which is a list with one entry for each table
# (plus a "logtable" entry describing every visited link).
all_tables <- get_all_site_tables(root_url, region_sel, tables_sel)

# The output is a list of data frames, corresponding to the tables.
str(all_tables)