Functions for accessing and preprocessing data

Loading included data

Let’s load the demo data.

#> -------------------- 
#> microbiomedataset version: 0.99.1 
#> -------------------- 
#> 1.expression_data:[ 19216 x 26 data.frame]
#> 2.sample_info:[ 26 x 8 data.frame]
#> 3.variable_info:[ 19216 x 8 data.frame]
#> 4.sample_info_note:[ 8 x 2 data.frame]
#> 5.variable_info_note:[ 8 x 2 data.frame]
#> -------------------- 
#> Processing information (extract_process_info())
#> create_microbiome_dataset ---------- 
#>             Package               Function.used                Time
#> 1 microbiomedataset create_microbiome_dataset() 2022-07-10 10:56:13


#> variables   samples 
#>     19216        26
#> variables 
#>     19216
#> samples 
#>      26
#>  [1] "CL3"      "CC1"      "SV1"      "M31Fcsw"  "M11Fcsw"  "M31Plmr" 
#>  [7] "M11Plmr"  "F21Plmr"  "M31Tong"  "M11Tong"  "LMEpi24M" "SLEpi20M"
#> [13] "AQC1cm"   "AQC4cm"   "AQC7cm"   "NP2"      "NP3"      "NP5"     
#> [19] "TRRsed1"  "TRRsed2"  "TRRsed3"  "TS28"     "TS29"     "Even1"   
#> [25] "Even2"    "Even3"
#> [1] "549322" "522457" "951"    "244423" "586076" "246140"
# List the column names of the sample metadata table.
extract_sample_info(global_patterns) %>%
  colnames()
#> [1] "sample_id"                "Primer"                  
#> [3] "Final_Barcode"            "Barcode_truncated_plus_T"
#> [5] "Barcode_full_length"      "SampleType"              
#> [7] "Description"              "class"
# List the column names of the variable (taxonomy) annotation table.
extract_variable_info(global_patterns) %>%
  colnames()
#> [1] "variable_id" "Kingdom"     "Phylum"      "Class"       "Order"      
#> [6] "Family"      "Genus"       "Species"
# Preview the first rows of the expression (abundance) table.
extract_expression_data(global_patterns) %>%
  head()
#>        CL3 CC1 SV1 M31Fcsw M11Fcsw M31Plmr M11Plmr F21Plmr M31Tong M11Tong
#> 549322   0   0   0       0       0       0       0       0       0       0
#> 522457   0   0   0       0       0       0       0       0       0       0
#> 951      0   0   0       0       0       0       1       0       0       0
#> 244423   0   0   0       0       0       0       0       0       0       0
#> 586076   0   0   0       0       0       0       0       0       0       0
#> 246140   0   0   0       0       0       0       0       0       0       0
#>        LMEpi24M SLEpi20M AQC1cm AQC4cm AQC7cm NP2 NP3 NP5 TRRsed1 TRRsed2
#> 549322        0        1     27    100    130   1   0   0       0       0
#> 522457        0        0      0      2      6   0   0   0       0       0
#> 951           0        0      0      0      0   0   0   0       0       0
#> 244423        0        0      0     22     29   0   0   0       0       0
#> 586076        0        0      0      2      1   0   0   0       0       0
#> 246140        0        0      0      1      3   0   0   0       0       0
#>        TRRsed3 TS28 TS29 Even1 Even2 Even3
#> 549322       0    0    0     0     0     0
#> 522457       0    0    0     0     0     0
#> 951          0    0    0     0     0     0
#> 244423       0    0    0     0     0     0
#> 586076       0    0    0     0     0     0
#> 246140       0    0    0     0     0     0
# Preview the first rows of the sample metadata table.
extract_sample_info(global_patterns) %>%
  head()
#>   sample_id  Primer Final_Barcode Barcode_truncated_plus_T Barcode_full_length
#> 1       CL3 ILBC_01        AACGCA                   TGCGTT         CTAGCGTGCGT
#> 2       CC1 ILBC_02        AACTCG                   CGAGTT         CATCGACGAGT
#> 3       SV1 ILBC_03        AACTGT                   ACAGTT         GTACGCACAGT
#> 4   M31Fcsw ILBC_04        AAGAGA                   TCTCTT         TCGACATCTCT
#> 5   M11Fcsw ILBC_05        AAGCTG                   CAGCTT         CGACTGCAGCT
#> 6   M31Plmr ILBC_07        AATCGT                   ACGATT         CGAGTCACGAT
#>   SampleType                                Description   class
#> 1       Soil   Calhoun South Carolina Pine soil, pH 4.9 Subject
#> 2       Soil   Cedar Creek Minnesota, grassland, pH 6.1 Subject
#> 3       Soil Sevilleta new Mexico, desert scrub, pH 8.3 Subject
#> 4      Feces    M3, Day 1, fecal swab, whole body study Subject
#> 5      Feces   M1, Day 1, fecal swab, whole body study  Subject
#> 6       Skin    M3, Day 1, right palm, whole body study Subject
# Preview the first rows of the variable (taxonomy) annotation table.
extract_variable_info(global_patterns) %>%
  head()
#>   variable_id Kingdom        Phylum        Class        Order        Family
#> 1      549322 Archaea Crenarchaeota Thermoprotei         <NA>          <NA>
#> 2      522457 Archaea Crenarchaeota Thermoprotei         <NA>          <NA>
#> 3         951 Archaea Crenarchaeota Thermoprotei Sulfolobales Sulfolobaceae
#> 4      244423 Archaea Crenarchaeota        Sd-NA         <NA>          <NA>
#> 5      586076 Archaea Crenarchaeota        Sd-NA         <NA>          <NA>
#> 6      246140 Archaea Crenarchaeota        Sd-NA         <NA>          <NA>
#>        Genus                  Species
#> 1       <NA>                     <NA>
#> 2       <NA>                     <NA>
#> 3 Sulfolobus Sulfolobusacidocaldarius
#> 4       <NA>                     <NA>
#> 5       <NA>                     <NA>
#> 6       <NA>                     <NA>


The microbiomedataset package also includes functions for filtering, subsetting, and merging abundance data.

In the following example, the global_patterns data is first transformed to relative abundance, creating the new global_patterns2 object, which is then filtered so that only OTUs with a mean relative abundance greater than 10^-5 are kept.

# Convert the abundances to relative intensity, add each variable's mean
# intensity, then keep only the variables whose mean exceeds 10^-5.
global_patterns2 <-
  global_patterns %>%
  transform2relative_intensity() %>%
  mutate2variable(what = "mean_intensity") %>%
  # Direct the following dplyr verb at the variable_info table.
  activate_microbiome_dataset(what = "variable_info") %>%
  # Namespace-qualified so the call never resolves to stats::filter().
  dplyr::filter(mean_intensity > 10^(-5))

This results in a heavily subsetted object, global_patterns2, containing just 4624 of the original 19216 OTUs.

Next, keep only the variables whose phylum is Chlamydiae.

# Restrict the dataset to variables annotated as phylum Chlamydiae. The inner
# call directs the filter at the variable_info table.
global_patterns_chl <- dplyr::filter(
  activate_microbiome_dataset(global_patterns, what = "variable_info"),
  Phylum == "Chlamydiae"
)

Next, keep only the samples with a total intensity greater than 20.

# Add each sample's summed intensity, then keep only the samples whose total
# is greater than 20.
global_patterns_chl <-
  global_patterns_chl %>%
  mutate2sample(what = "sum_intensity") %>%
  # Direct the following dplyr verb at the sample_info table.
  activate_microbiome_dataset(what = "sample_info") %>%
  # Namespace-qualified so the call never resolves to stats::filter().
  dplyr::filter(sum_intensity > 20)

