2023-04-01 21:39:59 -04:00
|
|
|
from pathlib import Path
|
|
|
|
from os.path import dirname
|
|
|
|
from snakemake.utils import min_version
|
|
|
|
|
|
|
|
# Refuse to run under Snakemake releases older than 7.20.
min_version("7.20")
|
|
|
|
|
2023-04-05 21:00:46 -04:00
|
|
|
|
2023-04-01 21:39:59 -04:00
|
|
|
# Workflow configuration is read from config/config.yml.
configfile: "config/config.yml"
|
|
|
|
|
|
|
|
|
|
|
|
# Download URL of the EPA occurrence-data archive for each UCMR cycle,
# keyed by the integer cycle number (looked up by rule download_ucmr).
ucmr_data = {
    cycle: "https://www.epa.gov/sites/default/files/" + path
    for cycle, path in (
        (2, "2015-09/ucmr2_occurrencedata_jan12.zip"),
        (3, "2017-02/ucmr-3-occurrence-data.zip"),
        (4, "2020-04/ucmr_4_occurrence_data.zip"),
    )
}
|
|
|
|
|
|
|
|
# Yearly tap-water report PDFs hosted on wsscwater.com, keyed by report year.
# Insertion order (newest first) is preserved because rule cat_reports
# expands over this mapping.
water_reports = {
    year: "https://www.wsscwater.com/" + path
    for year, path in (
        (2022, "sites/default/files/2023-03/2022%20POT%20%26%20PAX%20Tap%20Report.pdf"),
        (2021, "sites/default/files/2022-07/2021%20POT%20%26%20PAX%20Tap%20Report.pdf"),
        (2020, "sites/default/files/2021-04/2020%20POT%20%26%20PAX%20Tap%20Report.pdf"),
        (2019, "files/live/sites/wssc/files/tap%20water/2019%20POT%20%26%20PAX%20Tap%20Report.pdf"),
        (2018, "files/live/sites/wssc/files/tap%20water/2018%20POT%20%26%20PAX%20Tap%20Report.pdf"),
        (2017, "files/live/sites/wssc/files/tap%20water/2017%20POT%20%26%20PAX%20Tap_Report.pdf"),
        (2016, "files/live/sites/wssc/files/tap%20water/2016%20PAXPOT%20%20WQR.pdf"),
        (2015, "files/live/sites/wssc/files/tap%20water/2015%20POT%20%26%20PAX%20WQR%20Final%20050516.pdf"),
        (2014, "files/live/sites/wssc/files/tap%20water/2014%20POT%20%20PAX%20WQR%20Draft%20022715%20Corrected%20030915.pdf"),
        (2013, "files/live/sites/wssc/files/PDFs/TapAnalysis2013_27546.pdf"),
    )
}
|
|
|
|
|
|
|
|
|
|
|
|
# Download the occurrence-data archive for one UCMR cycle.
# The {ucmr} wildcard is converted to int and must be a key of `ucmr_data`;
# any other value raises in the params function.
rule download_ucmr:
    output:
        "resources/ucmr/{ucmr}.zip",
    params:
        url=lambda w: ucmr_data[int(w.ucmr)],
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # server's error page under the .zip name; :q shell-quotes both operands.
    shell:
        "curl --fail -sS -L -o {output:q} {params.url:q}"
|
|
|
|
|
|
|
|
|
2023-04-05 21:00:46 -04:00
|
|
|
# Extract the UCMR 2 archive into the parent directory of the output file.
# Other rules in this file reuse this recipe through `use rule`, overriding
# only input and output.
rule unzip_ucmr_2:
    input:
        expand(rules.download_ucmr.output, ucmr=[2]),
    output:
        "results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt",
    params:
        # Destination directory = directory part of the first output path.
        zipdest=lambda wildcards, output: dirname(output[0]),
    # The destination is removed and recreated, so unzip always writes into
    # an empty directory.
    shell:
        "rm -rf {params.zipdest} && "
        "mkdir {params.zipdest} && "
        "unzip {input} -d {params.zipdest}"
|
|
|
|
|
|
|
|
|
2023-04-05 21:00:46 -04:00
|
|
|
# UCMR 3: the unzip_ucmr_2 recipe applied to the cycle-3 archive.
use rule unzip_ucmr_2 as unzip_ucmr_3 with:
    input:
        expand(rules.download_ucmr.output, ucmr=[3]),
    output:
        "results/ucmr/ucmr_unzipped_3/UCMR3_All.txt",
|
|
|
|
|
|
|
|
|
2023-04-05 21:00:46 -04:00
|
|
|
# UCMR 4: the unzip_ucmr_2 recipe applied to the cycle-4 archive.
use rule unzip_ucmr_2 as unzip_ucmr_4 with:
    input:
        expand(rules.download_ucmr.output, ucmr=[4]),
    output:
        "results/ucmr/ucmr_unzipped_4/UCMR4_All.txt",
|
|
|
|
|
|
|
|
|
2023-04-05 21:00:46 -04:00
|
|
|
# The UCMR 4 table spells its unit with a real micro sign (byte 0xB5) instead
# of a plain "u", which makes R choke. Delete the first "(<micro>g/L)" on
# every line and write the result to a separate file.
rule fix_ucmr4_data_tbl:
    input:
        rules.unzip_ucmr_4.output,
    output:
        "results/ucmr/ucmr_unzipped_4/UCMR4_All_fixed.txt",
    shell:
        "sed 's/(\\xB5g\\/L)//' {input} > {output}"
|
|
|
|
|
|
|
|
|
2023-04-05 21:00:46 -04:00
|
|
|
# The UCMR 1 inputs are not downloaded by the workflow; create them by hand:
#   1) download https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip
#   2) open it in localc (LibreOffice Calc)
#   3) save each of the 'DPCache' tables as a tsv file (there should be three),
#      named resources/ucmr/ucmr1/ucmr1_list1and2chem_final_{1,2,3}.tsv
rule standardize_ucmr_1:
    input:
        expand(
            "resources/ucmr/ucmr1/ucmr1_list1and2chem_final_{i}.tsv",
            i=range(1, 4),
        ),
    output:
        "results/ucmr_data/all_std_1.txv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/standardize_ucmr_1.R"
|
2023-04-01 21:39:59 -04:00
|
|
|
|
2023-04-05 21:00:46 -04:00
|
|
|
|
|
|
|
# Run the shared UCMR 2/3/4 standardization script on the extracted UCMR 2
# table. Other standardize_ucmr_* rules inherit conda env and script from
# this rule via `use rule`.
# NOTE(review): the ".txv.gz" suffix looks like a typo for ".tsv.gz" - confirm
# before renaming, since existing results use this path.
rule standardize_ucmr_2:
    input:
        rules.unzip_ucmr_2.output,
    output:
        "results/ucmr_data/all_std_2.txv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/standardize_ucmr_234.R"
|
|
|
|
|
|
|
|
|
|
|
|
# Standardize the extracted UCMR 3 table with the same conda env and script
# as standardize_ucmr_2. Input and output must be the cycle-3 files: pointing
# them at the UCMR 4 table / all_std_4 would duplicate standardize_ucmr_4's
# output and leave UCMR 3 out of the results.
use rule standardize_ucmr_2 as standardize_ucmr_3 with:
    input:
        rules.unzip_ucmr_3.output,
    output:
        "results/ucmr_data/all_std_3.txv.gz",
|
|
|
|
|
|
|
|
|
2023-04-05 21:00:46 -04:00
|
|
|
# Standardize UCMR 4 from the micro-sign-free copy produced by
# fix_ucmr4_data_tbl. Input and output are both overridden, so only the conda
# env and script are inherited; those originate in standardize_ucmr_2.
use rule standardize_ucmr_2 as standardize_ucmr_4 with:
    input:
        rules.fix_ucmr4_data_tbl.output,
    output:
        "results/ucmr_data/all_std_4.txv.gz",
|
|
|
|
|
2023-04-01 21:39:59 -04:00
|
|
|
|
2023-04-05 21:00:46 -04:00
|
|
|
# Combine the standardized tables of UCMR cycles 1-4 into a single file.
rule concat_ucmr:
    input:
        # Order is kept as-is in case the script addresses inputs by position.
        rules.standardize_ucmr_1.output,
        rules.standardize_ucmr_2.output,
        rules.standardize_ucmr_3.output,
        rules.standardize_ucmr_4.output,
    output:
        "results/ucmr_data/all_std.txv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/concat_ucmr.R"
|
|
|
|
|
|
|
|
|
|
|
|
# Produce the two UCMR summary PDFs (named outputs `tap` and `plant`) from
# the concatenated table.
rule summarize_ucmr:
    input:
        rules.concat_ucmr.output,
    output:
        tap="results/ucmr_plots/tap.pdf",
        plant="results/ucmr_plots/plant.pdf",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/summarize_ucmr.R"
|
|
|
|
|
|
|
|
|
|
|
|
# POST a JSON query to the waterqualitydata.us Result search endpoint and
# save the zipped TSV response. The filters (country/state/county codes, the
# "within" radius around lat/long, data profile and providers) are the ones
# spelled out in the JSON body below; doubled braces are Snakemake escapes
# for literal { and }.
rule download_wqa_results:
    output:
        "resources/wqa/results.zip",
    # --fail makes curl exit non-zero on an HTTP error, so an error response
    # is never stored as results.zip.
    shell:
        """
        curl -Ss -q --fail -X POST --header 'Content-Type: application/json' \
        --header 'Accept: application/zip' \
        -d '{{"countrycode":["US"],
        "statecode":["US:24"],
        "countycode":["US:24:031"],
        "within":"20",
        "lat":"39.109",
        "long":"-77.2489",
        "dataProfile":"resultPhysChem",
        "providers":["NWIS","STEWARDS","STORET"]
        }}' \
        'https://www.waterqualitydata.us/data/Result/search?mimeType=tsv&zip=yes' \
        > {output}
        """
|
|
|
|
|
|
|
|
|
|
|
|
# POST a JSON query to the waterqualitydata.us Station search endpoint and
# save the zipped TSV response. The filters (country/state/county codes, the
# "within" radius around lat/long, and providers) are the ones spelled out in
# the JSON body below; doubled braces are Snakemake escapes for literal
# { and }.
rule download_wqa_station:
    output:
        "resources/wqa/station.zip",
    # --fail makes curl exit non-zero on an HTTP error, so an error response
    # is never stored as station.zip.
    shell:
        """
        curl -Ss -q --fail -X POST --header 'Content-Type: application/json' \
        --header 'Accept: application/zip' \
        -d '{{"countrycode":["US"],
        "statecode":["US:24"],
        "countycode":["US:24:031"],
        "within":"20",
        "lat":"39.109",
        "long":"-77.2489",
        "providers":["NWIS","STEWARDS","STORET"]
        }}' \
        'https://www.waterqualitydata.us/data/Station/search?mimeType=tsv&zip=yes' \
        > {output}
        """
|
|
|
|
|
|
|
|
|
|
|
|
# Extract the downloaded WQA results archive using the unzip_ucmr_2 recipe;
# the destination is the directory of the output file below.
use rule unzip_ucmr_2 as unzip_wqa_results with:
    input:
        rules.download_wqa_results.output,
    output:
        "results/wqa/src/results/resultphyschem.tsv",
|
|
|
|
|
|
|
|
|
|
|
|
# Extract the downloaded WQA station archive using the unzip_ucmr_2 recipe;
# the destination is the directory of the output file below.
use rule unzip_ucmr_2 as unzip_wqa_station with:
    input:
        rules.download_wqa_station.output,
    output:
        "results/wqa/src/station/station.tsv",
|
|
|
|
|
|
|
|
|
|
|
|
# Build one compressed table from the extracted WQA station and results
# files; the script receives them as the named inputs `station` and `results`.
rule standardize_wqa:
    input:
        station=rules.unzip_wqa_station.output,
        results=rules.unzip_wqa_results.output,
    output:
        "results/wqa/process/all.tsv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/standardize_wqa.R"
|
|
|
|
|
|
|
|
|
|
|
|
# Download the report PDF for one year.
# The {year} wildcard is converted to int and must be a key of
# `water_reports`; any other value raises in the params function.
rule download_water_report:
    output:
        "resources/wssc/{year}.pdf",
    params:
        url=lambda w: water_reports[int(w.year)],
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # server's error page under the .pdf name; :q shell-quotes both operands.
    shell:
        "curl --fail -sS -L -o {output:q} {params.url:q}"
|
|
|
|
|
|
|
|
|
|
|
|
# Convert one downloaded report PDF into a TSV for the same {year}.
# No conda directive is set here, so the script runs in whatever environment
# Snakemake itself was started from.
rule parse_water_report:
    input:
        rules.download_water_report.output,
    output:
        "results/wssc/{year}.tsv",
    script:
        "scripts/wssc_to_table.py"
|
|
|
|
|
|
|
|
|
|
|
|
# Concatenate the per-year TSVs, one for every key of `water_reports`, in the
# mapping's own order.
rule cat_reports:
    input:
        expand(rules.parse_water_report.output, year=list(water_reports)),
    output:
        "results/wssc/all.tsv",
    shell:
        "cat {input} > {output}"
|
|
|
|
|
|
|
|
|
|
|
|
# Run the WSSC analysis script on the concatenated report table; it writes
# two tables, exposed as the named outputs `limit` and `nolimit`.
rule analyse_reports:
    input:
        rules.cat_reports.output,
    output:
        limit="results/wssc/binned_limit.tsv",
        nolimit="results/wssc/detected_nolimit.tsv",
    conda:
        "envs/tidyverse.yml"
    script:
        "scripts/analyze_wssc.R"
|
|
|
|
|
|
|
|
|
|
|
|
# Aggregate target that requests the final outputs of the UCMR, WQA and WSSC
# branches. It is declared as the default target explicitly: this rule is the
# last one in the file, and without the directive Snakemake would pick the
# first rule (download_ucmr, which has a wildcard and cannot be a target).
rule all:
    default_target: True
    input:
        rules.summarize_ucmr.output,
        rules.standardize_wqa.output,
        rules.analyse_reports.output,
|