commit 3f68fbbbe6839eda4ad450ce63b78c31018cbc80
Author: ndwarshuis
Date:   Sat Apr 1 21:39:59 2023 -0400

    init commit

diff --git a/config/config.yml b/config/config.yml
new file mode 100644
index 0000000..19c4d21
--- /dev/null
+++ b/config/config.yml
@@ -0,0 +1 @@
+my_pswid: MD0150005
diff --git a/env.yml b/env.yml
new file mode 100644
index 0000000..8661570
--- /dev/null
+++ b/env.yml
@@ -0,0 +1,7 @@
+name: moco-water
+channels:
+  - conda-forge
+dependencies:
+  - snakemake-minimal=7.25.0
+  - mamba
+  - snakefmt
diff --git a/workflow/Snakefile b/workflow/Snakefile
new file mode 100644
index 0000000..d33cc65
--- /dev/null
+++ b/workflow/Snakefile
@@ -0,0 +1,122 @@
+from pathlib import Path
+from os.path import dirname
+from snakemake.utils import min_version
+
+min_version("7.20")
+
+configfile: "config/config.yml"
+
+
+# UCMR round -> (zip archive URL, data file name). Only the URL is read by the
+# rules below; the file names match the outputs of the unzip rules.
+ucmr_data = {
+    1: (
+        "https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip",
+        "ucmr1_list1and2chem_final.xls",
+    ),
+    2: (
+        "https://www.epa.gov/sites/default/files/2015-09/ucmr2_occurrencedata_jan12.zip",
+        "UCMR2_All_OccurrenceData_Jan12.txt",
+    ),
+    3: (
+        "https://www.epa.gov/sites/default/files/2017-02/ucmr-3-occurrence-data.zip",
+        "UCMR3_All.txt",
+    ),
+    4: (
+        "https://www.epa.gov/sites/default/files/2020-04/ucmr_4_occurrence_data.zip",
+        "UCMR4_All.txt",
+    ),
+}
+
+
+# Download the zip archive for the UCMR round given by the `ucmr` wildcard.
+rule download_ucmr:
+    output:
+        "resources/ucmr/{ucmr}.zip",
+    params:
+        url=lambda w: ucmr_data[int(w.ucmr)][0],
+    shell:
+        "curl -sS -L -o {output} {params.url}"
+
+
+# Extract the round-1 archive into a freshly recreated directory. The rules
+# for rounds 2-4 reuse this rule's params and shell command.
+rule unzip_ucmr_1:
+    input:
+        expand(rules.download_ucmr.output, ucmr=1),
+    output:
+        "results/ucmr/ucmr_unzipped_1/ucmr1_list1and2chem_final.xls",
+    params:
+        zipdest=lambda _, output: dirname(output[0]),
+    shell:
+        """
+        rm -rf {params.zipdest} && \
+        mkdir {params.zipdest} && \
+        unzip {input} -d {params.zipdest}
+        """
+
+
+use rule unzip_ucmr_1 as unzip_ucmr_2 with:
+    input:
+        expand(rules.download_ucmr.output, ucmr=2),
+    output:
+        "results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt",
+
+
+use rule unzip_ucmr_1 as unzip_ucmr_3 with:
+    input:
+        expand(rules.download_ucmr.output, ucmr=3),
+    output:
+        "results/ucmr/ucmr_unzipped_3/UCMR3_All.txt",
+
+
+use rule unzip_ucmr_1 as unzip_ucmr_4 with:
+    input:
+        expand(rules.download_ucmr.output, ucmr=4),
+    output:
+        "results/ucmr/ucmr_unzipped_4/UCMR4_All.txt",
+
+
+# Write a copy of the round-4 table in which the first match of the sed
+# pattern "(\xB5g/L)" has been removed from every line.
+rule fix_ucmr4_data_tbl:
+    input:
+        rules.unzip_ucmr_4.output,
+    output:
+        "results/ucmr/ucmr_unzipped_4/UCMR4_All_fixed.txt",
+    shell:
+        """
+        cat {input} | sed 's/(\\xB5g\\/L)//' > {output}
+        """
+
+
+# Run the shared R script on the round-3 table; round 4 reuses this rule with
+# the fixed table as input.
+rule standardize_ucmr_3:
+    input:
+        rules.unzip_ucmr_3.output,
+    output:
+        "results/ucmr_data/all_std_3.txv.gz",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/standardize_ucmr_34.R"
+
+
+use rule standardize_ucmr_3 as standardize_ucmr_4 with:
+    input:
+        rules.fix_ucmr4_data_tbl.output,
+    output:
+        "results/ucmr_data/all_std_4.txv.gz",
+
+
+# Build the standardized tables for rounds 3 and 4. `default_target` is needed
+# because this rule is not the first one in the file; without it a bare
+# `snakemake` call would pick `download_ucmr`, whose output has a wildcard.
+rule all:
+    default_target: True
+    input:
+        rules.standardize_ucmr_3.output,
+        rules.standardize_ucmr_4.output,
+        # expand(rules.unzip_ucmr.output, ucmr=[1, 2, 3]),
+        # rules.fix_ucmr4_data_tbl.output,
diff --git a/workflow/envs/tidyverse.yml b/workflow/envs/tidyverse.yml
new file mode 100644
index 0000000..ba1c4bc
--- /dev/null
+++ b/workflow/envs/tidyverse.yml
@@ -0,0 +1,7 @@
+name: moco-water-tidyverse
+channels:
+  - conda-forge
+dependencies:
+  - r-tidyverse
+  - r-styler
+  - r-lintr
diff --git a/workflow/scripts/standardize_ucmr_34.R b/workflow/scripts/standardize_ucmr_34.R
new file mode 100644
index 0000000..90f060e
--- /dev/null
+++ b/workflow/scripts/standardize_ucmr_34.R
@@ -0,0 +1,28 @@
+# Standardize one UCMR occurrence table: keep the columns listed below, parse
+# the collection date, rename the result column to `value`, and keep only the
+# rows whose PWSID equals the `my_pswid` config entry.
+library(tidyverse)
+
+snakemake@input[[1]] %>%
+  readr::read_tsv(
+    col_types = cols(
+      PWSID = "c",
+      PWSName = "c",
+      FacilityName = "c",
+      SamplePointID = "c",
+      SamplePointName = "c",
+      SamplePointType = "c",
+      CollectionDate = "c",
+      Contaminant = "c",
+      AnalyticalResultsSign = "c",
+      AnalyticalResultValue = "d",
+      # skip every column not listed above
+      .default = "-"
+    )
+  ) %>%
+  # NOTE(review): with no `format`, as.Date() only tries "%Y-%m-%d" and
+  # "%Y/%m/%d"; confirm the UCMR files store CollectionDate that way.
+  mutate(CollectionDate = as.Date(CollectionDate)) %>%
+  rename(value = "AnalyticalResultValue") %>%
+  filter(PWSID == snakemake@config[["my_pswid"]]) %>%
+  readr::write_tsv(snakemake@output[[1]])