Process and format amplicon count data

Author

Shane Hogle

Published

October 30, 2024

Abstract
This notebook formats 16S rRNA amplicon counts that have been mapped to HAMBI 16S genes.

1 Setup

Libraries and global variables

Show/hide code
library(here)
library(tidyverse)
library(fs)
library(archive)
source(here::here("R", "utils_generic.R"))

Set up some directories

Show/hide code
data_raw <- here::here("_data_raw", "illumina_v3v4")
amptar <- here::here(data_raw, "bbmap_rpkm.tar.gz")

# create temporary location to decompress
tmpdir <- fs::file_temp()

# make processed data directory if it doesn't exist
data <- here::here("data", "illumina_v3v4")
fs::dir_create(data)

Untar and decompress

Show/hide code
# untar directory containing variant tables 
archive::archive_extract(
  amptar,
  dir = tmpdir,
  files = NULL,
  options = character(),
  strip_components = 0L
)

ampdir <- here::here(tmpdir, "bbmap_rpkm")

2 Reading and small formatting of data

Coverage data

Show/hide code
ampfiles <- fs::dir_ls(
  path = ampdir,
  all = FALSE,
  recurse = TRUE,
  type = "file",
  glob = "*.rpkm",
  regexp = NULL,
  invert = FALSE,
  fail = TRUE
)

ampslurped <- readr::read_tsv(
  ampfiles,
  comment = "#",
  col_names = c(
    "strainID",
    "Length",
    "Bases",
    "Coverage",
    "count",
    "RPKM",
    "Frags",
    "FPKM"
  ),
  col_types = "cddddddd",
  id = "file_name"
)

format the data nicely

Show/hide code
ampslurpedfmt <- ampslurped %>%
  mutate(sample = str_extract(file_name, "HAMBI[:digit:]{4}")) %>%
  left_join(tax, by = join_by(strainID)) %>% 
  dplyr::select(sample, strainID, genus, species, count) %>% 
  mutate(strainID = str_replace(strainID, "-", "_"))

cleanup temporary directory

Show/hide code
fs::dir_delete(tmpdir)
Show/hide code
ampslurpedfmtnrm <- ampslurpedfmt %>% 
  mutate(
    count = case_when(
      # trunc ensures we get integers. important for some alpha div measures
      .$strainID == "HAMBI_1842" ~ trunc(.$count/4),
      .$strainID == "HAMBI_2659" ~ trunc(.$count/2),
      TRUE ~ as.numeric(.$count)
    ))

3 Export

Show/hide code
write_tsv(ampslurpedfmtnrm, here(data, "species_counts.tsv"))