moco-water/workflow/Snakefile

110 lines
2.6 KiB
Plaintext
Raw Normal View History

2023-04-01 21:39:59 -04:00
from pathlib import Path
from os.path import dirname
from snakemake.utils import min_version
min_version("7.20")
configfile: "config/config.yml"
# Download registry for the UCMR archives, keyed by the integer that is used
# as the ``{ucmr}`` wildcard.  Each value is a ``(zip_url, file_name)`` pair:
# index 0 is the URL fetched by ``download_ucmr`` and index 1 is the name of
# the data file, matching the basename of the corresponding unzip rule output.
_EPA_FILES_URL = "https://www.epa.gov/sites/default/files"

ucmr_data = {
    1: (
        _EPA_FILES_URL + "/2015-09/ucmr1_list1and2chem.zip",
        "ucmr1_list1and2chem_final.xls",
    ),
    2: (
        _EPA_FILES_URL + "/2015-09/ucmr2_occurrencedata_jan12.zip",
        "UCMR2_All_OccurrenceData_Jan12.txt",
    ),
    3: (
        _EPA_FILES_URL + "/2017-02/ucmr-3-occurrence-data.zip",
        "UCMR3_All.txt",
    ),
    4: (
        _EPA_FILES_URL + "/2020-04/ucmr_4_occurrence_data.zip",
        "UCMR4_All.txt",
    ),
}
rule download_ucmr:
    """
    Download the zip archive registered in ``ucmr_data`` for dataset ``{ucmr}``.

    The URL is looked up as ``ucmr_data[int(ucmr)][0]`` and the archive is
    written to ``resources/ucmr/{ucmr}.zip``.
    """
    output:
        "resources/ucmr/{ucmr}.zip",
    # Only numeric wildcard values can be converted by int() in the URL lookup,
    # so reject anything else at rule-matching time instead of failing there.
    wildcard_constraints:
        ucmr=r"\d+",
    params:
        url=lambda w: ucmr_data[int(w.ucmr)][0],
    shell:
        # --fail makes curl exit non-zero on HTTP errors (e.g. 404) instead of
        # saving the server's error page as the .zip and reporting success.
        "curl -sS -L --fail -o {output} {params.url}"
rule unzip_ucmr_1:
    """
    Extract the downloaded archive for dataset 1.

    The extraction directory is the parent directory of the declared output
    file.  It is removed and recreated before ``unzip`` runs.  The rules
    derived from this one via ``use rule ... with:`` override only ``input``
    and ``output``.
    """
    input:
        expand(rules.download_ucmr.output, ucmr=1),
    output:
        "results/ucmr/ucmr_unzipped_1/ucmr1_list1and2chem_final.xls",
    params:
        zipdest=lambda wildcards, output: dirname(output[0]),
    shell:
        """
        rm -rf {params.zipdest} && \
        mkdir {params.zipdest} && \
        unzip {input} -d {params.zipdest}
        """
# Variant of unzip_ucmr_1 for dataset 2: only the input archive and the
# expected extracted file are overridden.
use rule unzip_ucmr_1 as unzip_ucmr_2 with:
    input:
        expand(rules.download_ucmr.output, ucmr=2),
    output:
        "results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt",
# Variant of unzip_ucmr_1 for dataset 3: only the input archive and the
# expected extracted file are overridden.
use rule unzip_ucmr_1 as unzip_ucmr_3 with:
    input:
        expand(rules.download_ucmr.output, ucmr=3),
    output:
        "results/ucmr/ucmr_unzipped_3/UCMR3_All.txt",
# Variant of unzip_ucmr_1 for dataset 4: only the input archive and the
# expected extracted file are overridden.
use rule unzip_ucmr_1 as unzip_ucmr_4 with:
    input:
        expand(rules.download_ucmr.output, ucmr=4),
    output:
        "results/ucmr/ucmr_unzipped_4/UCMR4_All.txt",
rule fix_ucmr4_data_tbl:
    """
    Write a copy of the unzipped dataset 4 table with a unit suffix removed.

    On every line, the first occurrence of the literal text "(", byte 0xB5,
    "g/L)" is deleted; all other content is passed through unchanged.
    """
    input:
        rules.unzip_ucmr_4.output,
    output:
        "results/ucmr/ucmr_unzipped_4/UCMR4_All_fixed.txt",
    # NOTE(review): 0xB5 is the micro sign in Latin-1; this presumably assumes
    # the source table is Latin-1 encoded and that sed matches raw bytes in the
    # active locale -- confirm.
    shell:
        # sed reads the input file directly; no intermediate `cat` is needed.
        """
        sed 's/(\\xB5g\\/L)//' {input} > {output}
        """
rule standardize_ucmr_3:
    """
    Run ``scripts/standardize_ucmr_34.R`` on the unzipped dataset 3 table.

    The script executes in the conda environment described by
    ``envs/tidyverse.yml`` and produces a gzip-named output file.  The same
    recipe is reused for dataset 4 via ``use rule``.
    """
    input:
        rules.unzip_ucmr_3.output,
    output:
        # NOTE(review): ".txv.gz" may be a typo for ".tsv.gz" -- confirm with
        # downstream consumers before renaming.
        "results/ucmr_data/all_std_3.txv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/standardize_ucmr_34.R"
# Variant of standardize_ucmr_3 for dataset 4.  It consumes the table written
# by fix_ucmr4_data_tbl rather than the raw unzipped file.
use rule standardize_ucmr_3 as standardize_ucmr_4 with:
    input:
        rules.fix_ucmr4_data_tbl.output,
    output:
        # NOTE(review): ".txv.gz" may be a typo for ".tsv.gz" -- confirm with
        # downstream consumers before renaming.
        "results/ucmr_data/all_std_4.txv.gz",
rule all:
    """Aggregate target: the standardized tables for datasets 3 and 4."""
    # This rule is defined after the rules it references, so it is not the
    # first rule in the file; mark it explicitly as the default target so a
    # bare `snakemake` invocation builds it instead of the first rule.
    default_target: True
    input:
        rules.standardize_ucmr_3.output,
        rules.standardize_ucmr_4.output,