Difference between revisions of "Data scraping"

From A.P.E.S. wiki
Jump to navigation Jump to search
(Created page with " Copy and paste the following code into R to extract data from the A.P.E.S. Wiki. Explanations are included in this script on how to extract data for specific regions and tabl...")
 
(No difference)

Latest revision as of 04:26, 31 January 2024

Copy and paste the following code into R to extract data from the A.P.E.S. Wiki. Explanations are included in this script on how to extract data for specific regions and tables.

### A.P.E.S. Wiki Web Scraper 

# The following R code extracts data from the standardized A.P.E.S. Wiki tables across all, or a selection of, ape range regions.
# The functions below need to first be loaded into R.

install.packages("rvest")
install.packages("dplyr")
install.packages("httr")
library(rvest)
library(dplyr)
library(httr)

root_url <- "https://wiki.iucnapesportal.org"
main_page <- read_html(root_url)

# 1. Functions to get urls
get_all_site_urls <- function(root_url,region_sel){
  all_urls <- vector()
  regions <- get_region_urls(main_page)
  regionss=matrix(unlist(strsplit(regions,split="/")),ncol=3,byrow=T)[,3]
  if(sum(region_sel%in%"all")>0){
    regions=regions
    print("All regions are selected.")
  }else{
    regions=regions[regionss%in%region_sel]
    print(paste(region_sel, " is selected.", sep=""))
  }
  for(region in regions){
    country_urls <- get_country_urls(root_url, region)
    for(country in country_urls){
      site_urls <- get_site_urls(root_url, country)
      all_urls <- c(all_urls, site_urls)
    }
  }
  closeAllConnections()
  all_urls
}

get_region_urls <- function(main_page){
  region_links <- main_page %>% 
    html_nodes('.body')
  region_links[7] %>% 
    html_nodes('a') %>% 
    html_attr('href')
}

get_country_urls <- function(root_url, region_url){
  region_page <- tryCatch(content(GET(paste(root_url, region_url, sep = ""))), error=function(e) FALSE)
  if(length(region_page) > 1){
    region_page %>% 
      html_nodes('.mw-parser-output li a') %>% 
      html_attr('href')
  }
}

get_site_urls <- function(root_url, country_url){
  country_page <- tryCatch(content(GET(paste(root_url, country_url, sep = ""))), error=function(e) FALSE)
  if(length(country_page) > 1){
    country_page %>% 
      html_nodes('.mw-parser-output li a') %>% 
      html_attr('href')
  }
}

# 2. Get location data
get_region_country_and_site <- function(site_page){
  location_data <- site_page %>% 
    html_nodes('.mw-parser-output p a') %>% 
    html_text()
  location_data[1:3]
}

# 3. Basic site information
get_site_characteristics <- function(site_page){
  basic_information_table <- site_page %>% 
    html_nodes('.basic-information') %>%
    html_table(fill = FALSE)
  if(length(basic_information_table) > 0){
    basic_information_table[[1]][0:4, 2]
  }
}

# 4. Switch columns
switch_columns <- function(data_table){
  data_table[c((ncol(data_table)-2):ncol(data_table), 1:(ncol(data_table)-3))]
}

get_table <- function(site_page, selector, main_table, location_data, ncolx=0){
  table_data <- site_page %>%
    html_nodes(selector) %>%
    html_table(fill = FALSE)
  if(length(table_data) > 0){
    if(ncolx!=0){
      table_data <- table_data[[1]][,1:ncolx]
    }else{
      table_data <- table_data[[1]]
    }  
    table_data <- table_data %>%
      mutate_all(as.character)
    table_data=as.data.frame(table_data)
    colnames(table_data) <- colnames(main_table)[1:(ncol(main_table)-3)]
    table_data$Region <- location_data[1]
    table_data$Country <- location_data[2]
    table_data$Site <- location_data[3]
    table_data
    } else {
    print(paste(location_data[3], selector, "table not added"), sep = "")
    NA
  }
}

# 5. Main function to get data (uses the functions above)
get_all_site_tables <- function(root_url, region_sel, tables_sel){
  all_site_urls <- sort(get_all_site_urls(root_url,region_sel))
  xx=grepl("index.php/",all_site_urls)
  all_site_urls=all_site_urls[xx]
# browser()  
  logtable <- data.frame(link=paste(root_url, all_site_urls, sep = ""),
                         sitetype=rep("",length(all_site_urls))
                        )
  
  if(sum(tables_sel%in%"site_characteristics_table")>0){
  site_characteristics_table <- data.frame(Region=character(),
                                             Country=character(),
                                             Site=character(),
                                             Area=character(),
                                             Coordinates=character(),
                                             Designation=character(),
                                             'Habitat types'=character(),
                                             check.names=FALSE,
                                             stringsAsFactors=FALSE)
  }
   if(sum(tables_sel%in%"ape_status_table")>0){
   ape_status_table <- data.frame(Species=character(),
                                  Year=character(),
                                  'Abundance estimate (95% CI)'=character(),
                                  'Density estimate [ind./ km²] (95% CI)'=character(),
                                  'Encounter rate (nests/km)'=character(),
                                  Area=character(),
                                Method=character(),
                                Source=character(),
                                Comments=character(),
                                'A.P.E.S. database ID'=character(),
                                Region=character(),
                                Country=character(),
                                Site=character(),
                                check.names=FALSE,
                                stringsAsFactors=FALSE)
  }
  if(sum(tables_sel%in%"threats_table")>0){
  threats_table <- data.frame(Category=character(),
                              'Specific threats'=character(),
                              'Threat level'=character(),
                              'Quantified severity'=character(),
                              Description=character(),
                              'Year of threat'=character(),
                              Region=character(),
                              Country=character(),
                              Site=character(),
                              check.names=FALSE,
                              stringsAsFactors=FALSE)
  
  }
  if(sum(tables_sel%in%"conservation_activities_table")>0){
  conservation_activities_table <- data.frame(Category=character(),
                                              'Specific activity'=character(),
                                              'Description'=character(),
                                              'Year of activity'=character(),
                                              Region=character(),
                                              Country=character(),
                                              Site=character(),
                                              check.names=FALSE,
                                              stringsAsFactors=FALSE)

  }
  if(sum(tables_sel%in%"challenges_table")>0){
  challenges_table <- data.frame(Challenge=character(),
                                 Source=character(),
                                 Region=character(),
                                 Country=character(),
                                 Site=character(),
                                 check.names=FALSE,
                                 stringsAsFactors=FALSE)

  }
  if(sum(tables_sel%in%"behaviours_table")>0){
  behaviours_table <- data.frame(Behavior=character(),
                                 Source=character(),
                                 Region=character(),
                                 Country=character(),
                                 Site=character(),
                                 check.names=FALSE,
                                 stringsAsFactors=FALSE)

  }
  for(i in 1:length(all_site_urls)){
#   for(i in 100:110){
    site_page <- tryCatch(content(GET(paste(root_url, all_site_urls[i], sep = ""))), error=function(e) FALSE)
    if(length(site_page) < 2){
      logtable$sitetype[i]="no page"
      print(paste(i, "of", length(all_site_urls), " ", "no page", all_site_urls[i], sep = " "))
    }else{
      location_data <- get_region_country_and_site(site_page)
      if(is.na(location_data[1])){
        logtable$sitetype[i]="link to construction page"
        print(paste(i, "of", length(all_site_urls), " ", "link to construction page", all_site_urls[i], sep = " "))
      }else{
        if(location_data[1]=="Region"){
          logtable$sitetype[i]="link to empty content page"
          print(paste(i, "of", length(all_site_urls), " ", "link to empty content page", all_site_urls[i], sep = " "))
        }else{
          logtable$sitetype[i]="link to filled content page"
          print(paste(i, "of", length(all_site_urls), " ", "link to filled content page", all_site_urls[i], sep = " "))
          site_characteristics <- unlist(get_site_characteristics(site_page))
          if(length(site_characteristics) > 1){
            if(sum(tables_sel%in%"site_characteristics_table")>0){
              site_characteristics_table[nrow(site_characteristics_table) + 1, ] <- c(location_data, site_characteristics)
            }  
          } else {
            print(paste(location_data[3], "basic information table not added"))
          }
          #ape_status_table <- rbind(ape_status_table, get_table(site_page, '.population-estimate-table', ape_status_table, location_data))
		  if(sum(tables_sel%in%"ape_status_table")>0){
            xx=get_table(site_page, '.population-estimate-table', ape_status_table, location_data, ncol=10)
            if(is.data.frame(xx)){ape_status_table <- rbind(ape_status_table, xx)}
            switch_columns(ape_status_table)
          }
		  if(sum(tables_sel%in%"threats_table")>0){
            xx=get_table(site_page, '.threats-table', threats_table, location_data, ncol=6)
            if(is.data.frame(xx)){threats_table <- rbind(threats_table, xx)}
            switch_columns(threats_table)
          }
          if(sum(tables_sel%in%"conservation_activities_table")>0){
            xx=get_table(site_page, '.conservation-actions-table', conservation_activities_table, location_data, ncol=0)
            if(is.data.frame(xx)){conservation_activities_table <- rbind(conservation_activities_table, xx)}
            switch_columns(conservation_activities_table)
          } 
          if(sum(tables_sel%in%"behaviours_table")>0){
            xx=get_table(site_page, '.behaviors-table', behaviours_table, location_data, ncol=0)
            if(is.data.frame(xx)){behaviours_table <- rbind(behaviours_table, xx)}
            switch_columns(behaviours_table)
          }
		  if(sum(tables_sel%in%"challenges_table")>0){
            xx=get_table(site_page, '.challenges-table', challenges_table, location_data, ncol=0)
            if(is.data.frame(xx)){challenges_table <- rbind(challenges_table, xx)}
            switch_columns(challenges_table)
          }
          #challenges_table <- rbind(challenges_table, get_table(site_page, '.challenges-table', challenges_table, location_data))
        }
      }
    }
    closeAllConnections()
  }
  tablesreturn=c("logtable",tables_sel)
  xx=lapply(tablesreturn,function(x){get(x)})
  names(xx)=tablesreturn
  return(xx)
}


# After loading the functions, select the regions and tables that you need to extract data from.

# 1. REGIONS 
  # Please define 'region_sel'; it could have the following values: 
  # "all" = all regions would be read
  # or one or a combination of: "West_Africa","Central_Africa","East_Africa","Asia" 
  # E.g., region_sel=c("East_Africa", "Asia")

# 2. TABLES 
  # Please define 'tables_sel' from the following options: 
  # a combination of: "site_characteristics_table","threats_table","conservation_activities_table","behaviours_table"
  # e.g.
  
region_sel="all" 
tables_sel=c("site_characteristics_table","threats_table","conservation_activities_table","behaviours_table","challenges_table","ape_status_table")

# Creates "all_tables", which is a list with one entry for each table
all_tables <- get_all_site_tables(root_url, region_sel, tables_sel) 
# The output is a list of dataframes, corresponding to the tables
str(all_tables)