PLS_DFE/Report.Rmd at main · tbata/PLS_DFE

354 lines (275 loc) · 9.1 KB
title: "PLS models on effect of $N_e$ and generation time on DFE"
subtitle: "summary & visuals using"
author: "Thomas B. & Janek S."
date: "`r Sys.Date()`"
  html_document:
   toc: true
   theme: readable
   highlight: pygments
   toc_float: true
   code_folding: hide
  csv_file: "scratch/reg_vars.csv"  # default can be changed with render; 
  mytree_file: "scratch/science.abn7829_data_s4.nex.tree" # default;
editor_options: 
  chunk_output_type: console
```{r, include=FALSE, warning=FALSE}
# Setup: fix the RNG seed, load the tidyverse plus the phylogenetics,
# modelling and table/plot helper packages, then set plotting defaults.
set.seed(42) # for reproducibility, if needed
library(tidyverse)
library(ape) # tree reading/pruning and phylogenetic correlation structures
library(nlme) # for ML fits of the phylogenetic GLS models via gls()
library(car) # for vif etc
library(knitr)  # for kable()
library(caper) # for comparative.data() and pgls()
library(ggthemes)
library(ggrepel) # for the annotations and prettification (geom_text_repel)
# options for formatting figs & tables
# (the commented-out theme_set() calls are alternative base fonts)
# theme_set(theme_minimal(base_size = 15, base_family = "Futura") )
theme_set(theme_minimal(base_size = 15, base_family = "Verdana") )
# theme_set(theme_minimal(base_size = 15, base_family = "Helvetica") )
options(pillar.sigfig = 3) # significant digits shown when tibbles are printed
geom_pt_size <- 1.4 # passed as `size` to several geom_point() layers below
```{r, global_options, include=FALSE}
# Global chunk options for more readable reports (ONLY use when checked).
# Note: the active call only sets message=FALSE; it does not set
# warning=FALSE, so warnings are not suppressed here.
# NOTE(review): tidy.opts is presumably inert unless tidy=TRUE is also set,
# as in the disabled variant just below — confirm.
# knitr::opts_chunk$set(message=FALSE, tidy.opts=list(width.cutoff=60), tidy=TRUE) 
knitr::opts_chunk$set(message=FALSE, tidy.opts=list(width.cutoff=60)) 
# Reading in the csv 
The input table (`reg_file`) used for the analysis and this report is: `r tools::file_path_sans_ext(basename(params$csv_file))`
# Resolve both input paths from the report parameters, so that overriding
# them through rmarkdown::render(params = ...) is honoured everywhere.
# Alternative hard-coded inputs, kept for reference:
# reg_file <- "scratch/reg_vars.csv" # all del file shared
# reg_file <- "scratch/reg_vars.gamma.del.csv"
reg_file <- params$csv_file # all del file shared
# The tree path used to be hard-coded to the default file, which silently
# ignored the `mytree_file` parameter that the tree-reading step relies on.
mytree_file <- params$mytree_file
tree_file <- mytree_file
# Read the regression table; show_col_types = FALSE keeps the report free of
# the column-specification message.
summary_csv_tbl <- read_csv(reg_file, show_col_types = FALSE)
# A glimpse of data
# Render the first rows of the raw table, rounded to two decimals
kable(head(summary_csv_tbl), digits = 2)
# Formatting and filtering .. 
df <- summary_csv_tbl %>%
  filter(! (is.na(generation_time))) %>%
  rename(species = population) %>%
    species = ifelse(
      str_count(species, "_") >= 2,
      sub("(_[^_]+)$", "", species),
      species
    ), # remove subspecies postfix if present
    logNe = log10(Ne)
  group_by(species) %>% # avoid pseudo reps by aggregating ssp as a single point
  summarise(
    Ne = mean(Ne, na.rm = TRUE), # average Ne across subspecies
    range_inf_m10 = mean(.data$`range_inf_-10`, na.rm = TRUE),
    range_m1_0 = mean(.data$`range_-1_0`, na.rm = TRUE),
    logNe = mean(logNe, na.rm = TRUE),
    generation_time = mean(generation_time, na.rm = TRUE), # average gt across subspecies
    .groups = "drop"
  kable(digits = 2)
# Phylogeny tree and reformatting of tips 
The phylogenetic tree used for this analysis is read from `r params$mytree_file`.
# Load the tree named by the `mytree_file` report parameter
primphylo <- read.tree(params$mytree_file)

# Species found both among the tree tips and in the data
common <- intersect(primphylo$tip.label, df$species)

# Prune every tip that has no matching row in the data
primphylo <- drop.tip(
  primphylo,
  setdiff(primphylo$tip.label, common)
)

# plot(primphylo, type = "radial", cex = 0.6)
plot(primphylo, cex = 0.6)

# Restrict the data to the pruned tree and sort rows in tip-label order
# (species becomes a factor whose levels follow the tree tips)
df <- df |>
  filter(species %in% primphylo$tip.label) |>
  mutate(species = factor(species, levels = primphylo$tip.label)) |>
  arrange(species)
dim(df) # Only 29 species left ...
# Regression models 
# Model formula shared by the OLS and gls() fits: proportion of strongly
# deleterious mutations explained by log10(Ne) and generation time.
form <- range_inf_m10 ~ logNe + generation_time
model_OLS <- lm(form, data = df)
# Variance inflation factors: collinearity check for the two predictors
vif(model_OLS)
# Visual check of how the two predictors relate: default smoother versus a
# straight-line fit (orange), both without confidence bands.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
ggplot(data = df, aes(x = generation_time, y = logNe)) +
  geom_point(size = geom_pt_size) +
  geom_smooth(se = FALSE) +
  geom_smooth(method = "lm", color = "darkorange", se = FALSE)
## PGLS models
# PGLS Brownian
model_PGLS_brownian <- gls(
  form, data = df,
  correlation = corBrownian(phy = primphylo, form = ~species),
  method = "ML"
# PGLS Grafen
model_PGLS_grafen <- gls(
  form, data = df,
  correlation = corGrafen(phy = primphylo, form = ~species, value = 0.8, fixed = FALSE),
  method = "ML"
## Pagel model 
```{r, echo=TRUE, include=TRUE}
# Fit PGLS with Pagel's lambda estimated by ML, and compare it with a fit
# where lambda is pinned close to zero.
# First build the comparative dataset: the pruned tree matched to the data
# rows through the `species` column (vcv = TRUE is requested here as well).
comp29 <- comparative.data(primphylo, as.data.frame(df), 
                           names.col = species, vcv = TRUE)
# 1. Estimated lambda (ML)
mod_lambda_ML <- pgls(range_inf_m10 ~ logNe + generation_time, 
                     data = comp29, lambda = "ML")
# 2. Lambda fixed at 1e-5, i.e. effectively 0 (intended as the OLS-equivalent)
# NOTE(review): a tiny positive value is used instead of exactly 0,
# presumably because of pgls()'s lower bound on lambda — confirm.
mod_lambda_0 <- pgls(range_inf_m10 ~ logNe + generation_time, 
                     data = comp29,  lambda = 0.00001)
# Print the lambda value and the AIC of each fit for comparison
mod_lambda_ML$param["lambda"]
mod_lambda_ML$aic
mod_lambda_0$param["lambda"]
mod_lambda_0$aic
## Summary of models 
Below, we build a summary table (AIC, $R^2$ between observed and fitted values, coefficients and their p-values):
# Squared Pearson correlation between observed values `y` and fitted values
# `fit`, computed on the complete (non-missing) pairs only.
predR2 <- function(y, fit) {
  pearson_r <- cor(y, fit, use = "complete.obs")
  pearson_r^2
}
  predR2(df$range_inf_m10, fitted(model_OLS)),
  predR2(df$range_inf_m10, fitted(model_PGLS_brownian)),
  predR2(df$range_inf_m10, fitted(model_PGLS_grafen))
pvals <- rbind(
  summary(model_OLS)$coefficients[, "Pr(>|t|)"],
  summary(model_PGLS_brownian)$tTable[, "p-value"],
  summary(model_PGLS_grafen)$tTable[, "p-value"]
tbl_sum <- rbind(
  round(coef(model_OLS), 3),
  round(coef(model_PGLS_brownian), 3),
  round(coef(model_PGLS_grafen), 3)
models_spec <- c("OLS", "PGLS Brownian", "PGLS Grafen")
models_aic <- c(
  round(AIC(model_OLS), 2),
  round(AIC(model_PGLS_brownian), 2),
  round(AIC(model_PGLS_grafen), 2)
out_tbl <- cbind(
  `Regression Model` = models_spec,
  AIC = models_aic,
  `R^2` = round(r2s, 3),
  Intercept = tbl_sum[, 1],
  `p(Intercept)` = signif(pvals[, 1], 3),
  log10Ne = tbl_sum[, 2],
  `p(log10Ne)` = signif(pvals[, 2], 3),
  gen_time = tbl_sum[, 3],
  `p(gen_time)` = signif(pvals[, 3], 3)
kable(out_tbl, format = "pandoc", digits = 2)
# extract Grafen's model rho
model_PGLS_grafen$modelStruct$corStruct
  ggplot(aes(x = logNe, y = range_inf_m10, label = species,
             color = generation_time)) +
  geom_point(size = geom_pt_size) +
  # Linear trend without a confidence band. `linewidth` replaces `size`,
  # which is deprecated for lines since ggplot2 3.4.0, and `FALSE` replaces
  # the reassignable shorthand `F`.
  geom_smooth(method = "lm", se = FALSE, linewidth = 0.4) +
  # Disabled overlays of the fitted model lines, kept for reference:
  # geom_abline(slope = model_PGLS_brownian$coefficients[2],
  #             intercept = model_PGLS_brownian$coefficients[1],
  #             color="orange")+
  # geom_abline(slope = model_PGLS_grafen$coefficients[2],
  #             intercept = model_PGLS_grafen$coefficients[1],
  #             color="magenta")+
  # geom_abline(slope = model_OLS$coefficients[2], 
  #             intercept = model_OLS$coefficients[1],
  #             color="orange")+
  xlab("Log Effective size") +
  ylab("Prop of v. deleterious mutations") +
  # Species labels, repelled so that they do not overlap the points
  geom_text_repel(size = 2.5, max.overlaps = 10,
                  min.segment.length = 0.25, point.padding = 0.2) +
  # Legend drawn inside the panel, lower right
  theme(legend.position = "inside",
        legend.position.inside = c(0.85, 0.25)) +
  scale_color_viridis_b(direction = +1)+
  ggplot(aes(x = logNe, y = range_m1_0, label = species,
             color = generation_time)) +
  geom_point(size = geom_pt_size) +
  # Linear trend without a confidence band. `linewidth` replaces `size`,
  # which is deprecated for lines since ggplot2 3.4.0, and `FALSE` replaces
  # the reassignable shorthand `F`.
  geom_smooth(method = "lm", se = FALSE, linewidth = 0.4) +
  xlab("Log Effective size") +
  ylab("Prop of slight del. mutations") +
  # Species labels, repelled so that they do not overlap the points
  geom_text_repel(size = 2.5, max.overlaps = 10,
                  min.segment.length = 0.25, point.padding = 0.2) +
  # Legend drawn inside the panel, upper right
  theme(legend.position = "inside",
        legend.position.inside = c(0.85, 0.75)) +
  scale_color_viridis_b(direction = +1)+
# One last visual : generation time versus strongly del. 
  ggplot(aes(x = generation_time, y = range_inf_m10, label = species,
             color = logNe)) +
  geom_point(size = geom_pt_size) +
  # Linear trend with its confidence band. `linewidth` replaces `size`,
  # which is deprecated for lines since ggplot2 3.4.0, and `TRUE` replaces
  # the reassignable shorthand `T`.
  geom_smooth(method = "lm", se = TRUE, linewidth = 0.4) +
  # geom_smooth(method = "loess", se = TRUE, linewidth = 0.4) +
  xlab("Generation time") +
  ylab("Prop of strongly del. mutations") +
  # Species labels, repelled so that they do not overlap the points
  geom_text_repel(size = 2.5, max.overlaps = 10,
                  min.segment.length = 0.25, point.padding = 0.2) +
  # Legend drawn inside the panel, upper right
  theme(legend.position = "inside",
        legend.position.inside = c(0.9, 0.82)) +
  scale_color_viridis_b(direction = +1)+
# Appendix : visuals on regression checks 
model_OLS_tbl <- tibble(epsilon = model_OLS$residuals,
                        fitted = model_OLS$fitted.values) 
model_OLS_tbl|>
ggplot(aes(x=fitted, y = epsilon))+
  geom_point()+
  geom_smooth(method="lm")+
model_OLS_tbl|>
  ggplot(aes(x=epsilon))+
  geom_histogram(bins = 8, color="grey50")+
shapiro.test(x = model_OLS_tbl$epsilon)
## Brownian
model_brownian_tbl <- tibble(epsilon = model_PGLS_brownian$residuals,
                        fitted = model_PGLS_brownian$fitted) 
model_brownian_tbl|>
  ggplot(aes(x=fitted, y = epsilon))+
  geom_point()+
  geom_smooth(method="lm")+
model_brownian_tbl|>
  ggplot(aes(x=epsilon))+
  geom_histogram(bins = 8, color="grey50")+
shapiro.test(x = model_brownian_tbl$epsilon)
model_grafen_tbl <- tibble(epsilon = model_PGLS_grafen$residuals,
                             fitted = model_PGLS_grafen$fitted) 
model_grafen_tbl|>
  ggplot(aes(x=fitted, y = epsilon))+
  geom_point()+
  geom_smooth(method="lm")+
model_grafen_tbl|>
  ggplot(aes(x=epsilon))+
  geom_histogram(bins = 8, color="grey50")+
shapiro.test(x = model_grafen_tbl$epsilon)
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

Report.Rmd

Latest commit

History

Report.Rmd

File metadata and controls