### rgbif tutorial  [ --> how to use R to access the Global Biodiversity Information Facility database]
### Elizabeth Murray | @PhyloSolving
### April 2019, current address: Smithsonian National Museum of Natural History, Washington, DC


## here is a quick tutorial on using specimen occurrence records to visualize biodiversity data
## find more info on GBIF at: https://www.gbif.org/  (Global Biodiversity Information Facility)
## important - you shouldn't expect that all data in the database is correct/current taxonomy/complete, etc.

## after you try this tutorial, modify the script by replacing the taxon name with your own taxonomic group of interest

# pdf documentation on rgbif: https://cran.r-project.org/web/packages/rgbif/rgbif.pdf


install.packages(c("rgbif", "RColorBrewer", "vioplot")) 
library(rgbif)
library(RColorBrewer)
library(vioplot)

setwd("C:/Users/xx/xx/xx")  #optional, set a folder if you want to save plots, the R script, etc.

## In this example, I'm accessing all records of Eucharitidae (cool-looking wasps) in the US. There are <600 records on GBIF, and only six genera present.
## Eucharitidae = a family of Hymenoptera which are parasitoids of ants (I have a bit of info on the group here, if you're interested: https://www.elizabethmurray.us/eucharitid-ant-parasitoids.html)

###  to get the records of Eucharitidae in the US
occ_euch <- occ_data(scientificName = "Eucharitidae", country = "US", limit=200000)
# note - the max limit of occurrences is 200000, but in this case I know there are far fewer
# at the time of this writing (April 2019) there are 578 records

# check ?occ_data for more info; there are many different arguments that could be entered here
# for instance, we could search for all insects in Minnesota, which are holotypes : 
# #  MN_insect_holo <- occ_data(scientificName = "Insecta", stateProvince = "Minnesota", typeStatus = "holotype", limit=200000)


occ_euch
# all the occurrence data [i.e. all info known for a specimen] is stored in this table, called a tibble
# read more on tibbles at: https://cran.r-project.org/web/packages/tibble/vignettes/tibble.html
# Use options(tibble.print_max = Inf) to always show all rows (otherwise you see the 1st 10); options(tibble.width = Inf)


#take a look at a few of the different categories of data
occ_euch$data[, c(30:33, 38:39)] 
# by using the concatenate function "c()", we can pull out non-sequential columns via their number
# in this instance, since we didn't assign them to an object, we are only viewing them on the console


genus_list <- occ_euch$data$genus
# givs us a list of the diff
genus_list  # take a look -- in this case, there are NAs because no genus was identified
genus_list[is.na(genus_list)] <- "no genus ID"
# for the purpose of plotting, I'm going to replace the NAs with something else
genus_list # the NAs should be gone

plotcol=brewer.pal(9, "Blues")    # diff way to do it: plotcol = palette(topo.colors(5))
genera <- as.factor(genus_list); plot(genera, cex.names= 0.7, main= "number of GBIF records for each genus", col= plotcol)
#this is in alphabetical order, but I'd like a different pattern



### what if we want to order the genera starting from highest number of records?

grouped_list <- tapply(genus_list,genus_list, length)
# here I have combined & counted up all records for each genera
grouped_list # take a look

df_list <- as.data.frame(grouped_list)  # I want it in a different format
df_list  # take a look

df_sort <- df_list[order(-df_list[,"grouped_list"]), , drop = FALSE] # to sort descending
# need to have the drop=FALSE argument in order to keep the structure
# could order the opposite way by removing the '-' df_sort <- df_list[order(df_list[,"grouped_list"]), , drop = FALSE]  # to sort ascending

df_sort  # see the new order of column 1

myorder <- row.names(df_sort)
# I just want the row names (genus names), so that I can order the plot according to them

ordered_genera <- ordered(genus_list, levels = myorder)
num_cols <- nlevels(ordered_genera)

plotcol=brewer.pal(num_cols, "Blues")  
plot(ordered_genera, cex.names= 0.7, main= "number of GBIF records for each eucharitid genus in the US", col= plotcol)

## after all that -- we have a plot ordered by most records to least records, in a pleasing blue ombre




######## more data visualization -- plotting the number of records by year
### here, we can output a few different plots to take a look at the historical pattern of GBIF eucharitid occurances

# remember -- need to have the occurrence data in an object (which was done previously, but if you're just starting here -- do this:
# occ_euch <- occ_data(scientificName = "Eucharitidae", country = "US", limit=200000)

years <- occ_euch$data$year
# make a list of the years that these eucharitids were collected; some records have no year recorded & we'll ignore those


# visualizing data via a violin plot
number_of_records <- table(years)
number_of_records  # take a look
vioplot(number_of_records, names="", col = "darkorange3", main ="number of Eucharitidae collected in a year, according to GBIF records")
# the relative width of the violin = frequency. The width comes from the count of the years that have the same number of records (on the y axis)
# this shows that each year, there are typically VERY FEW records, like, 1-3 records ['no record', or 0s are not considered]
# however, in some years, there are many records (54 is the highest for a single year [in 2010])
## to plot with 0s included, I needed to make a script to pad the missing time series with 0s for years not present; contact me if you're really interested


# I like violin plots
## A violin plot combines box plots and kernel density plots 
# box --> white dot = median value and the black bar = interquartile range (Q1-Q3)
# density --> this is a continuous probability density distribution (not ACTUAL data values)



####### last plots : take a look at two ways of looking at bar charts by year

# this one shows only the years with records
yearsf <- as.factor(years) # this causes the default to be a bar plot by year, otherwise it's a scatter plot with year on the y axis
plot(yearsf, col = ifelse(number_of_records < 10,'cadetblue1','cadetblue4'), cex.names=0.7, las=2, cex.axis=0.8)
title(main="number of eucharitids collected in each year", cex.main=1, line = -1)
#for plot(), cex.names controls the bars, and cex.axis controls y tick labels

# I used conditional colors => color the years that have fewer than 10 records differently than the rest

# here, even 0 record years are included -- i.e., you get ALL the years plotted, even those without any records
plot(number_of_records, col = ifelse(number_of_records < 10,'cadetblue1','cadetblue4'),cex.axis=0.8)
#for plot.xy , cex.axis controls all tick labels --both x and y


#  That was our look at Eucharitidae records in the US! 
#  I think it's great to explore data and use this as a way to learn more about scripting at the same time.

## as of the time of this writing, there were 568 occurrences (records)
## plots will look different as records get added to gbif -- however, more genera are not likely to be added to Eucharitidae, since no others are known to occur in the US