This notebook formats for downstream use the table and VCF output from mutect2 called on WGS population data in the evolved bacteria inoculum. These are species that had been experimentally evolved before the main match/mis-match experiment. The formatting removes some variants with spurious features and connects variants to functional annotations and gene identities.
Show/hide code
library(here)library(tidyverse)library(withr)library(fs)library(archive)source(here::here("R", "utils_generic.R"))# Set up some directories -------------------------------------------------data_raw <- here::here("_data_raw", "wgs")vcftar <- here::here(data_raw, "tables.tar.gz")# create temporary location to decompresstmpdir <- fs::file_temp()# make processed data directory if it doesn't existdata <- here::here("data", "wgs")fs::dir_create(data)# untar directory containing tables archive::archive_extract( vcftar,dir = tmpdir,files =NULL,options =character(),strip_components =0L)vcfdir <- here::here(tmpdir, "tables")# Read variant SnpEff annotations -----------------------------------------snpefffiles <- fs::dir_ls(path = vcfdir,all =FALSE,recurse =TRUE,type ="file",glob ="*.snpeff.tsv",regexp =NULL,invert =FALSE,fail =TRUE)snpeffslurped <- readr::read_tsv( snpefffiles,skip =1,col_names =c("chrom","pos","ref","alt","filter",# Annotated using Sequence Ontology terms. Multiple effects can be concatenated using '&'."effect",# A simple estimation of putative impact / deleteriousness : HIGH, MODERATE, LOW, or MODIFIER"impact",# gene id from prokka"locus_tag",# whether variant is coding or noncoding"biotype",# Variant using HGVS notation (DNA level)"hgvs_c",# Variant using HGVS notation (Protein level)."hgvs_p",# Position in coding bases (one based includes START and STOP codons)."cds_pos",# Total number of coding bases (one based includes START and STOP codons)."cds_len",# Position in translated product (one based includes START and STOP codons)."aa_pos",# Total length of translated product (one based includes START and STOP codons)."aa_len" ),col_types =c("cdcccccccccdddd"),id ="sample")snpeffslurpedfmt <- snpeffslurped %>%# this is weird, but joining on column like filter that contains characters like ';' doesnt work dplyr::select(-filter) %>%mutate(sample =str_extract(sample, regex("(HAMBI_[:digit:]{4}-[:alpha:]-[:alnum:]{3})|(HAMBI_[:digit:]{4}-[:alpha:]-SH-WGS-[:digit:]{3})"))) %>%separate(sample, into =c("strainID", "replicate", "sample"), sep="-", extra ="merge")# Read variant frequencies ------------------------------------------------varfiles <- fs::dir_ls(path = vcfdir,all =FALSE,recurse =TRUE,type ="file",glob ="*.variants.tsv",regexp =NULL,invert =FALSE,fail =TRUE)varslurped <- readr::read_tsv( varfiles,skip =1,col_names =c("chrom","pos","ref","alt","filter","type",# Allelic depths for the ref and alt alleles in the order listed"ad",# Allele fractions of alternate alleles. Excludes filtering"freq_alt_raw",# Approximate read depth (reads are filtered if MQ=255 or with bad mates)"depth_total",# Count of fragments supporting each allele."frag_allele_depth",# Genotype Quality"genotype_qual" ),col_types =c("cdccccccccc"),id ="sample")varslurpedfmt <- varslurped %>%mutate(sample =str_extract(sample, regex("(HAMBI_[:digit:]{4}-[:alpha:]-[:alnum:]{3})|(HAMBI_[:digit:]{4}-[:alpha:]-SH-WGS-[:digit:]{3})"))) %>%separate(sample, into =c("strainID", "replicate", "sample"), sep="-", extra ="merge") %>%# taken only first two most abundant allelesseparate(ad, into =c("depth_ref", "depth_alt"), sep=",", extra="drop") %>%# format depths to numbericmutate(depth_ref =as.numeric(depth_ref), depth_alt =as.numeric(depth_alt)) %>%select(-frag_allele_depth, -genotype_qual) %>%mutate(freq_alt = depth_alt/(depth_ref + depth_alt),freq_ref =1- freq_alt) %>%# final format depths to numericmutate(freq_alt_raw =as.numeric(freq_alt_raw),depth_total =as.numeric(depth_total))# Read premade annotations -----------------------------------------------------annotations <-read_rds(here::here("_data_raw", "shared", "annotations_codon_degeneracy.rds"))# Combine -----------------------------------------------------------------full <-left_join(varslurpedfmt, snpeffslurpedfmt,by =join_by(strainID, replicate, sample, chrom, pos, ref, alt)) %>%left_join(annotations,by =join_by(strainID, chrom, locus_tag))# Write output ------------------------------------------------------------write_rds(full, here::here(data, "mutect_parsed_annotated.rds"))# Clean up ----------------------------------------------------------------# remove decompressed vcf and tables directory from temp locationfs::dir_delete(vcfdir)
Source Code
---title: "Format WGS variants"author: "Shane Hogle"date: todayabstract: "This notebook formats for downstream use the table and VCF output from mutect2 called on WGS population data in the evolved bacteria inoculum. These are species that had been experimentally evolved before the main match/mis-match experiment. The formatting removes some variants with spurious features and connects variants to functional annotations and gene identities."---```{r}#| output: false#| warning: false#| error: falselibrary(here)library(tidyverse)library(withr)library(fs)library(archive)source(here::here("R", "utils_generic.R"))# Set up some directories -------------------------------------------------data_raw <- here::here("_data_raw", "wgs")vcftar <- here::here(data_raw, "tables.tar.gz")# create temporary location to decompresstmpdir <- fs::file_temp()# make processed data directory if it doesn't existdata <- here::here("data", "wgs")fs::dir_create(data)# untar directory containing tables archive::archive_extract( vcftar,dir = tmpdir,files =NULL,options =character(),strip_components =0L)vcfdir <- here::here(tmpdir, "tables")# Read variant SnpEff annotations -----------------------------------------snpefffiles <- fs::dir_ls(path = vcfdir,all =FALSE,recurse =TRUE,type ="file",glob ="*.snpeff.tsv",regexp =NULL,invert =FALSE,fail =TRUE)snpeffslurped <- readr::read_tsv( snpefffiles,skip =1,col_names =c("chrom","pos","ref","alt","filter",# Annotated using Sequence Ontology terms. Multiple effects can be concatenated using '&'."effect",# A simple estimation of putative impact / deleteriousness : HIGH, MODERATE, LOW, or MODIFIER"impact",# gene id from prokka"locus_tag",# whether variant is coding or noncoding"biotype",# Variant using HGVS notation (DNA level)"hgvs_c",# Variant using HGVS notation (Protein level)."hgvs_p",# Position in coding bases (one based includes START and STOP codons)."cds_pos",# Total number of coding bases (one based includes START and STOP codons)."cds_len",# Position in translated product (one based includes START and STOP codons)."aa_pos",# Total length of translated product (one based includes START and STOP codons)."aa_len" ),col_types =c("cdcccccccccdddd"),id ="sample")snpeffslurpedfmt <- snpeffslurped %>%# this is weird, but joining on column like filter that contains characters like ';' doesnt work dplyr::select(-filter) %>%mutate(sample =str_extract(sample, regex("(HAMBI_[:digit:]{4}-[:alpha:]-[:alnum:]{3})|(HAMBI_[:digit:]{4}-[:alpha:]-SH-WGS-[:digit:]{3})"))) %>%separate(sample, into =c("strainID", "replicate", "sample"), sep="-", extra ="merge")# Read variant frequencies ------------------------------------------------varfiles <- fs::dir_ls(path = vcfdir,all =FALSE,recurse =TRUE,type ="file",glob ="*.variants.tsv",regexp =NULL,invert =FALSE,fail =TRUE)varslurped <- readr::read_tsv( varfiles,skip =1,col_names =c("chrom","pos","ref","alt","filter","type",# Allelic depths for the ref and alt alleles in the order listed"ad",# Allele fractions of alternate alleles. Excludes filtering"freq_alt_raw",# Approximate read depth (reads are filtered if MQ=255 or with bad mates)"depth_total",# Count of fragments supporting each allele."frag_allele_depth",# Genotype Quality"genotype_qual" ),col_types =c("cdccccccccc"),id ="sample")varslurpedfmt <- varslurped %>%mutate(sample =str_extract(sample, regex("(HAMBI_[:digit:]{4}-[:alpha:]-[:alnum:]{3})|(HAMBI_[:digit:]{4}-[:alpha:]-SH-WGS-[:digit:]{3})"))) %>%separate(sample, into =c("strainID", "replicate", "sample"), sep="-", extra ="merge") %>%# taken only first two most abundant allelesseparate(ad, into =c("depth_ref", "depth_alt"), sep=",", extra="drop") %>%# format depths to numbericmutate(depth_ref =as.numeric(depth_ref), depth_alt =as.numeric(depth_alt)) %>%select(-frag_allele_depth, -genotype_qual) %>%mutate(freq_alt = depth_alt/(depth_ref + depth_alt),freq_ref =1- freq_alt) %>%# final format depths to numericmutate(freq_alt_raw =as.numeric(freq_alt_raw),depth_total =as.numeric(depth_total))# Read premade annotations -----------------------------------------------------annotations <-read_rds(here::here("_data_raw", "shared", "annotations_codon_degeneracy.rds"))# Combine -----------------------------------------------------------------full <-left_join(varslurpedfmt, snpeffslurpedfmt,by =join_by(strainID, replicate, sample, chrom, pos, ref, alt)) %>%left_join(annotations,by =join_by(strainID, chrom, locus_tag))# Write output ------------------------------------------------------------write_rds(full, here::here(data, "mutect_parsed_annotated.rds"))# Clean up ----------------------------------------------------------------# remove decompressed vcf and tables directory from temp locationfs::dir_delete(vcfdir)```