# Snakemake workflow that downloads and summarizes three water-quality data
# sets: EPA UCMR occurrence data, Water Quality Portal results/stations, and
# WSSC tap water reports.

from pathlib import Path
from os.path import dirname
from snakemake.utils import min_version

min_version("7.20")


configfile: "config/config.yml"


# UCMR round number -> URL of the zipped occurrence data for that round.
ucmr_data = {
    2: "https://www.epa.gov/sites/default/files/2015-09/ucmr2_occurrencedata_jan12.zip",
    3: "https://www.epa.gov/sites/default/files/2017-02/ucmr-3-occurrence-data.zip",
    4: "https://www.epa.gov/sites/default/files/2020-04/ucmr_4_occurrence_data.zip",
}

# Report year -> URL of the WSSC tap report PDF for that year.
water_reports = {
    2022: "https://www.wsscwater.com/sites/default/files/2023-03/2022%20POT%20%26%20PAX%20Tap%20Report.pdf",
    2021: "https://www.wsscwater.com/sites/default/files/2022-07/2021%20POT%20%26%20PAX%20Tap%20Report.pdf",
    2020: "https://www.wsscwater.com/sites/default/files/2021-04/2020%20POT%20%26%20PAX%20Tap%20Report.pdf",
    2019: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2019%20POT%20%26%20PAX%20Tap%20Report.pdf",
    2018: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2018%20POT%20%26%20PAX%20Tap%20Report.pdf",
    2017: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2017%20POT%20%26%20PAX%20Tap_Report.pdf",
    2016: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2016%20PAXPOT%20%20WQR.pdf",
    2015: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2015%20POT%20%26%20PAX%20WQR%20Final%20050516.pdf",
    2014: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2014%20POT%20%20PAX%20WQR%20Draft%20022715%20Corrected%20030915.pdf",
    2013: "https://www.wsscwater.com/files/live/sites/wssc/files/PDFs/TapAnalysis2013_27546.pdf",
}


# Download one UCMR archive; the {ucmr} wildcard is the key into ucmr_data.
rule download_ucmr:
    output:
        "resources/ucmr/{ucmr}.zip",
    params:
        url=lambda w: ucmr_data[int(w.ucmr)],
    shell:
        "curl -sS -L -o {output} {params.url}"


# Extract an archive into the directory that holds the declared output file.
# The destination is wiped and recreated first so stale files never linger.
# This rule is also the template for the other unzip rules below.
rule unzip_ucmr_2:
    input:
        expand(rules.download_ucmr.output, ucmr=2),
    output:
        "results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt",
    params:
        zipdest=lambda _, output: dirname(output[0]),
    shell:
        """
        rm -rf {params.zipdest} && \
        mkdir {params.zipdest} && \
        unzip {input} -d {params.zipdest}
        """


use rule unzip_ucmr_2 as unzip_ucmr_3 with:
    input:
        expand(rules.download_ucmr.output, ucmr=3),
    output:
        "results/ucmr/ucmr_unzipped_3/UCMR3_All.txt",


use rule unzip_ucmr_2 as unzip_ucmr_4 with:
    input:
        expand(rules.download_ucmr.output, ucmr=4),
    output:
        "results/ucmr/ucmr_unzipped_4/UCMR4_All.txt",


# they used a real micro symbol instead of "u", which makes R choke
# (the sed expression drops the first "(<0xB5>g/L)" it finds on each line)
rule fix_ucmr4_data_tbl:
    input:
        rules.unzip_ucmr_4.output,
    output:
        "results/ucmr/ucmr_unzipped_4/UCMR4_All_fixed.txt",
    shell:
        """
        cat {input} | sed 's/(\\xB5g\\/L)//' > {output}
        """


# manually make these data files
# 1) download zip from here: https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip
# 2) open in localc
# 3) save each of the 'DPCache' tables to tsv files (there should be three)
rule standardize_ucmr_1:
    input:
        expand("resources/ucmr/ucmr1/ucmr1_list1and2chem_final_{i}.tsv", i=[1, 2, 3]),
    output:
        "results/ucmr_data/all_std_1.txv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/standardize_ucmr_1.R"


# Standardize one unzipped UCMR table; rounds 2, 3 and 4 share the same script.
rule standardize_ucmr_2:
    input:
        rules.unzip_ucmr_2.output,
    output:
        "results/ucmr_data/all_std_2.txv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/standardize_ucmr_234.R"


# Round 3 reads its own unzipped table and writes its own output file. It must
# not share an output path with standardize_ucmr_4, otherwise two rules claim
# the same file and the UCMR 3 data never reaches concat_ucmr.
use rule standardize_ucmr_2 as standardize_ucmr_3 with:
    input:
        rules.unzip_ucmr_3.output,
    output:
        "results/ucmr_data/all_std_3.txv.gz",


# Round 4 reads the table with the micro sign already stripped.
use rule standardize_ucmr_3 as standardize_ucmr_4 with:
    input:
        rules.fix_ucmr4_data_tbl.output,
    output:
        "results/ucmr_data/all_std_4.txv.gz",


# Combine the four standardized UCMR tables into a single file.
rule concat_ucmr:
    input:
        rules.standardize_ucmr_1.output,
        rules.standardize_ucmr_2.output,
        rules.standardize_ucmr_3.output,
        rules.standardize_ucmr_4.output,
    output:
        "results/ucmr_data/all_std.txv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/concat_ucmr.R"


# Produce the two UCMR summary PDFs from the combined table.
rule summarize_ucmr:
    input:
        rules.concat_ucmr.output,
    output:
        tap="results/ucmr_plots/tap.pdf",
        plant="results/ucmr_plots/plant.pdf",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/summarize_ucmr.R"


# POST a JSON query to the Water Quality Portal Result endpoint and save the
# zipped TSV response. The doubled braces are Snakemake escapes for literal
# "{" and "}" in the JSON payload.
rule download_wqa_results:
    output:
        "resources/wqa/results.zip",
    shell:
        """
        curl -Ss -q -X POST --header 'Content-Type: application/json' \
        --header 'Accept: application/zip' \
        -d '{{"countrycode":["US"], "statecode":["US:24"], "countycode":["US:24:031"], "within":"20", "lat":"39.109", "long":"-77.2489", "dataProfile":"resultPhysChem", "providers":["NWIS","STEWARDS","STORET"] }}' \
        'https://www.waterqualitydata.us/data/Result/search?mimeType=tsv&zip=yes' \
        > {output}
        """


# Same query as download_wqa_results, without "dataProfile", sent to the
# Station endpoint.
rule download_wqa_station:
    output:
        "resources/wqa/station.zip",
    shell:
        """
        curl -Ss -q -X POST --header 'Content-Type: application/json' \
        --header 'Accept: application/zip' \
        -d '{{"countrycode":["US"], "statecode":["US:24"], "countycode":["US:24:031"], "within":"20", "lat":"39.109", "long":"-77.2489", "providers":["NWIS","STEWARDS","STORET"] }}' \
        'https://www.waterqualitydata.us/data/Station/search?mimeType=tsv&zip=yes' \
        > {output}
        """


use rule unzip_ucmr_2 as unzip_wqa_results with:
    input:
        rules.download_wqa_results.output,
    output:
        "results/wqa/src/results/resultphyschem.tsv",


use rule unzip_ucmr_2 as unzip_wqa_station with:
    input:
        rules.download_wqa_station.output,
    output:
        "results/wqa/src/station/station.tsv",


# Build one table from the unzipped station and result files.
rule standardize_wqa:
    input:
        station=rules.unzip_wqa_station.output,
        results=rules.unzip_wqa_results.output,
    output:
        "results/wqa/process/all.tsv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/standardize_wqa.R"


# Download one WSSC report PDF; the {year} wildcard is the key into
# water_reports.
rule download_water_report:
    output:
        "resources/wssc/{year}.pdf",
    params:
        url=lambda w: water_reports[int(w.year)],
    shell:
        "curl -sS -L -o {output} {params.url}"


# Convert one report PDF into a TSV table.
rule parse_water_report:
    input:
        rules.download_water_report.output,
    output:
        "results/wssc/{year}.tsv",
    script:
        "scripts/wssc_to_table.py"


# Concatenate the per-year tables; iterating water_reports yields its year
# keys, so every listed report is included.
rule cat_reports:
    input:
        expand(rules.parse_water_report.output, year=water_reports),
    output:
        "results/wssc/all.tsv",
    shell:
        "cat {input} > {output}"


# Analyse the concatenated WSSC table and write the two result tables.
rule analyse_reports:
    input:
        rules.cat_reports.output,
    output:
        limit="results/wssc/binned_limit.tsv",
        nolimit="results/wssc/detected_nolimit.tsv",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/analyze_wssc.R"


# Aggregate target for the whole workflow. It is defined last, so
# default_target is needed for a bare `snakemake` call to build it; otherwise
# the first rule (download_ucmr, which has a wildcard) would be the default.
rule all:
    default_target: True
    input:
        rules.summarize_ucmr.output,
        rules.standardize_wqa.output,
        rules.analyse_reports.output,