# Snakemake workflow: download the EPA UCMR occurrence-data archives, unzip
# them, patch the UCMR4 table, and run the R standardisation script on the
# UCMR3 and UCMR4 tables.
from pathlib import Path
from os.path import dirname

from snakemake.utils import min_version

min_version("7.20")


configfile: "config/config.yml"


# UCMR round -> (URL of the zip archive, name of a data file listed for it).
# Only the URL (index 0) is read by the rules below; the file names are
# repeated literally in the unzip rules' outputs.
ucmr_data = {
    1: (
        "https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip",
        "ucmr1_list1and2chem_final.xls",
    ),
    2: (
        "https://www.epa.gov/sites/default/files/2015-09/ucmr2_occurrencedata_jan12.zip",
        "UCMR2_All_OccurrenceData_Jan12.txt",
    ),
    3: (
        "https://www.epa.gov/sites/default/files/2017-02/ucmr-3-occurrence-data.zip",
        "UCMR3_All.txt",
    ),
    4: (
        "https://www.epa.gov/sites/default/files/2020-04/ucmr_4_occurrence_data.zip",
        "UCMR4_All.txt",
    ),
}


# Download the zip archive for one UCMR round into resources/ucmr/.
rule download_ucmr:
    output:
        "resources/ucmr/{ucmr}.zip",
    wildcard_constraints:
        # Only rounds that have an entry in ucmr_data can be downloaded; this
        # turns a bad request into a "no rule produces this file" error
        # instead of a ValueError/KeyError raised from the params lambda.
        ucmr="|".join(str(k) for k in ucmr_data),
    params:
        url=lambda w: ucmr_data[int(w.ucmr)][0],
    shell:
        # --fail makes curl exit non-zero on HTTP errors (e.g. 404) instead of
        # saving the server's error page as the .zip file.
        "curl --fail -sS -L -o {output} {params.url}"


# Unzip the round-1 archive into a freshly created directory. The declared
# output is one file expected inside the archive; the destination directory is
# derived from that output path.
rule unzip_ucmr_1:
    input:
        expand(rules.download_ucmr.output, ucmr=1),
    output:
        "results/ucmr/ucmr_unzipped_1/ucmr1_list1and2chem_final.xls",
    params:
        zipdest=lambda _, output: dirname(output[0]),
    shell:
        # Remove any previous extraction first so unzip never hits existing
        # files; -p keeps mkdir from failing if parent directories are absent.
        """
        rm -rf {params.zipdest} && \
        mkdir -p {params.zipdest} && \
        unzip {input} -d {params.zipdest}
        """


# The remaining rounds reuse the params and shell command of unzip_ucmr_1 and
# only override the archive (input) and the expected data file (output).
use rule unzip_ucmr_1 as unzip_ucmr_2 with:
    input:
        expand(rules.download_ucmr.output, ucmr=2),
    output:
        "results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt",


use rule unzip_ucmr_1 as unzip_ucmr_3 with:
    input:
        expand(rules.download_ucmr.output, ucmr=3),
    output:
        "results/ucmr/ucmr_unzipped_3/UCMR3_All.txt",


use rule unzip_ucmr_1 as unzip_ucmr_4 with:
    input:
        expand(rules.download_ucmr.output, ucmr=4),
    output:
        "results/ucmr/ucmr_unzipped_4/UCMR4_All.txt",


# Write a copy of the UCMR4 table with the first occurrence per line of the
# text "(<byte 0xB5>g/L)" removed.
# NOTE(review): byte 0xB5 is presumably the micro sign in a Latin-1 encoded
# file, i.e. the text "(µg/L)" -- confirm against the data.
rule fix_ucmr4_data_tbl:
    input:
        rules.unzip_ucmr_4.output,
    output:
        "results/ucmr/ucmr_unzipped_4/UCMR4_All_fixed.txt",
    shell:
        # The doubled backslashes are Python string escapes; sed receives
        # s/(\xB5g\/L)//
        """
        sed 's/(\\xB5g\\/L)//' {input} > {output}
        """


# Run the shared R standardisation script on the UCMR3 table.
# NOTE(review): the ".txv.gz" extension may be a typo for ".tsv.gz"; it is left
# unchanged because the path is part of the workflow's outputs.
rule standardize_ucmr_3:
    input:
        rules.unzip_ucmr_3.output,
    output:
        "results/ucmr_data/all_std_3.txv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/standardize_ucmr_34.R"


# Same script and conda environment, applied to the patched UCMR4 table.
use rule standardize_ucmr_3 as standardize_ucmr_4 with:
    input:
        rules.fix_ucmr4_data_tbl.output,
    output:
        "results/ucmr_data/all_std_4.txv.gz",


# Aggregate target. It has to be defined after the rules it references, so it
# is marked as the default target explicitly; otherwise Snakemake would pick
# the first rule in the file (download_ucmr, which has a wildcard and cannot
# be used as a target).
rule all:
    default_target: True
    input:
        rules.standardize_ucmr_3.output,
        rules.standardize_ucmr_4.output,
        # Disabled targets. NOTE: no rule named "unzip_ucmr" exists in this
        # file (the rules are unzip_ucmr_1 .. unzip_ucmr_4), so the first line
        # would need updating before being re-enabled.
        # expand(rules.unzip_ucmr.output, ucmr=[1, 2, 3]),
        # rules.fix_ucmr4_data_tbl.output,