ADD lots of current wssc analysis
This commit is contained in:
parent
3f68fbbbe6
commit
0578a0ed89
|
@ -1 +1,2 @@
|
||||||
my_pswid: MD0150005
|
my_pswid: MD0150005
|
||||||
|
my_plant_id: "0100000"
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
Full Traceback (most recent call last):
|
||||||
|
File "/home/ndwar/.local/share/conda/envs/moco-water/lib/python3.11/site-packages/snakemake/__init__.py", line 643, in snakemake
|
||||||
|
workflow.include(
|
||||||
|
File "/home/ndwar/.local/share/conda/envs/moco-water/lib/python3.11/site-packages/snakemake/workflow.py", line 1238, in include
|
||||||
|
exec(compile(code, snakefile.get_path_or_uri(), "exec"), self.globals)
|
||||||
|
File "/mnt/data/Dvl/env/home/water/workflow/Snakefile", line 7, in <module>
|
||||||
|
configfile: "config/config.yml"
|
||||||
|
File "/home/ndwar/.local/share/conda/envs/moco-water/lib/python3.11/site-packages/snakemake/workflow.py", line 1311, in configfile
|
||||||
|
raise WorkflowError(
|
||||||
|
snakemake.exceptions.WorkflowError: Workflow defines configfile config/config.yml but it is not present or accessible (full checked path: /mnt/data/Dvl/env/home/water/workflow/config/config.yml).
|
||||||
|
|
||||||
|
WorkflowError in file /mnt/data/Dvl/env/home/water/workflow/Snakefile, line 7:
|
||||||
|
Workflow defines configfile config/config.yml but it is not present or accessible (full checked path: /mnt/data/Dvl/env/home/water/workflow/config/config.yml).
|
||||||
|
File "/mnt/data/Dvl/env/home/water/workflow/Snakefile", line 7, in <module>
|
|
@ -0,0 +1,3 @@
|
||||||
|
WorkflowError in file /mnt/data/Dvl/env/home/water/workflow/Snakefile, line 7:
|
||||||
|
Workflow defines configfile config/config.yml but it is not present or accessible (full checked path: /mnt/data/Dvl/env/home/water/workflow/config/config.yml).
|
||||||
|
File "/mnt/data/Dvl/env/home/water/workflow/Snakefile", line 7, in <module>
|
|
@ -4,26 +4,27 @@ from snakemake.utils import min_version
|
||||||
|
|
||||||
min_version("7.20")
|
min_version("7.20")
|
||||||
|
|
||||||
|
|
||||||
configfile: "config/config.yml"
|
configfile: "config/config.yml"
|
||||||
|
|
||||||
|
|
||||||
ucmr_data = {
|
ucmr_data = {
|
||||||
1: (
|
2: "https://www.epa.gov/sites/default/files/2015-09/ucmr2_occurrencedata_jan12.zip",
|
||||||
"https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip",
|
3: "https://www.epa.gov/sites/default/files/2017-02/ucmr-3-occurrence-data.zip",
|
||||||
"ucmr1_list1and2chem_final.xls",
|
4: "https://www.epa.gov/sites/default/files/2020-04/ucmr_4_occurrence_data.zip",
|
||||||
),
|
}
|
||||||
2: (
|
|
||||||
"https://www.epa.gov/sites/default/files/2015-09/ucmr2_occurrencedata_jan12.zip",
|
water_reports = {
|
||||||
"UCMR2_All_OccurrenceData_Jan12.txt",
|
2022: "https://www.wsscwater.com/sites/default/files/2023-03/2022%20POT%20%26%20PAX%20Tap%20Report.pdf",
|
||||||
),
|
2021: "https://www.wsscwater.com/sites/default/files/2022-07/2021%20POT%20%26%20PAX%20Tap%20Report.pdf",
|
||||||
3: (
|
2020: "https://www.wsscwater.com/sites/default/files/2021-04/2020%20POT%20%26%20PAX%20Tap%20Report.pdf",
|
||||||
"https://www.epa.gov/sites/default/files/2017-02/ucmr-3-occurrence-data.zip",
|
2019: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2019%20POT%20%26%20PAX%20Tap%20Report.pdf",
|
||||||
"UCMR3_All.txt",
|
2018: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2018%20POT%20%26%20PAX%20Tap%20Report.pdf",
|
||||||
),
|
2017: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2017%20POT%20%26%20PAX%20Tap_Report.pdf",
|
||||||
4: (
|
2016: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2016%20PAXPOT%20%20WQR.pdf",
|
||||||
"https://www.epa.gov/sites/default/files/2020-04/ucmr_4_occurrence_data.zip",
|
2015: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2015%20POT%20%26%20PAX%20WQR%20Final%20050516.pdf",
|
||||||
"UCMR4_All.txt",
|
2014: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2014%20POT%20%20PAX%20WQR%20Draft%20022715%20Corrected%20030915.pdf",
|
||||||
),
|
2013: "https://www.wsscwater.com/files/live/sites/wssc/files/PDFs/TapAnalysis2013_27546.pdf",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,16 +32,16 @@ rule download_ucmr:
|
||||||
output:
|
output:
|
||||||
"resources/ucmr/{ucmr}.zip",
|
"resources/ucmr/{ucmr}.zip",
|
||||||
params:
|
params:
|
||||||
url=lambda w: ucmr_data[int(w.ucmr)][0],
|
url=lambda w: ucmr_data[int(w.ucmr)],
|
||||||
shell:
|
shell:
|
||||||
"curl -sS -L -o {output} {params.url}"
|
"curl -sS -L -o {output} {params.url}"
|
||||||
|
|
||||||
|
|
||||||
rule unzip_ucmr_1:
|
rule unzip_ucmr_2:
|
||||||
input:
|
input:
|
||||||
expand(rules.download_ucmr.output, ucmr=1),
|
expand(rules.download_ucmr.output, ucmr=2),
|
||||||
output:
|
output:
|
||||||
"results/ucmr/ucmr_unzipped_1/ucmr1_list1and2chem_final.xls",
|
"results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt",
|
||||||
params:
|
params:
|
||||||
zipdest=lambda _, output: dirname(output[0]),
|
zipdest=lambda _, output: dirname(output[0]),
|
||||||
shell:
|
shell:
|
||||||
|
@ -51,27 +52,21 @@ rule unzip_ucmr_1:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
use rule unzip_ucmr_1 as unzip_ucmr_2 with:
|
use rule unzip_ucmr_2 as unzip_ucmr_3 with:
|
||||||
input:
|
|
||||||
expand(rules.download_ucmr.output, ucmr=2),
|
|
||||||
output:
|
|
||||||
"results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt",
|
|
||||||
|
|
||||||
|
|
||||||
use rule unzip_ucmr_1 as unzip_ucmr_3 with:
|
|
||||||
input:
|
input:
|
||||||
expand(rules.download_ucmr.output, ucmr=3),
|
expand(rules.download_ucmr.output, ucmr=3),
|
||||||
output:
|
output:
|
||||||
"results/ucmr/ucmr_unzipped_3/UCMR3_All.txt",
|
"results/ucmr/ucmr_unzipped_3/UCMR3_All.txt",
|
||||||
|
|
||||||
|
|
||||||
use rule unzip_ucmr_1 as unzip_ucmr_4 with:
|
use rule unzip_ucmr_2 as unzip_ucmr_4 with:
|
||||||
input:
|
input:
|
||||||
expand(rules.download_ucmr.output, ucmr=4),
|
expand(rules.download_ucmr.output, ucmr=4),
|
||||||
output:
|
output:
|
||||||
"results/ucmr/ucmr_unzipped_4/UCMR4_All.txt",
|
"results/ucmr/ucmr_unzipped_4/UCMR4_All.txt",
|
||||||
|
|
||||||
|
|
||||||
|
# they used a real micro symbol instead of "u", which makes R choke
|
||||||
rule fix_ucmr4_data_tbl:
|
rule fix_ucmr4_data_tbl:
|
||||||
input:
|
input:
|
||||||
rules.unzip_ucmr_4.output,
|
rules.unzip_ucmr_4.output,
|
||||||
|
@ -83,15 +78,38 @@ rule fix_ucmr4_data_tbl:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
rule standardize_ucmr_3:
|
# manually make these data files
|
||||||
|
# 1) download zip from here: https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip
|
||||||
|
# 2) open in localc
|
||||||
|
# 3) save each of the 'DPCache' tables to tsv files (there should be three)
|
||||||
|
rule standardize_ucmr_1:
|
||||||
input:
|
input:
|
||||||
rules.unzip_ucmr_3.output,
|
expand("resources/ucmr/ucmr1/ucmr1_list1and2chem_final_{i}.tsv", i=[1, 2, 3]),
|
||||||
output:
|
output:
|
||||||
"results/ucmr_data/all_std_3.txv.gz",
|
"results/ucmr_data/all_std_1.txv.gz",
|
||||||
conda:
|
conda:
|
||||||
"envs/tidyverse.yml"
|
"envs/tidyverse.yml"
|
||||||
script:
|
script:
|
||||||
"scripts/standardize_ucmr_34.R"
|
"scripts/standardize_ucmr_1.R"
|
||||||
|
|
||||||
|
|
||||||
|
rule standardize_ucmr_2:
|
||||||
|
input:
|
||||||
|
rules.unzip_ucmr_2.output,
|
||||||
|
output:
|
||||||
|
"results/ucmr_data/all_std_2.txv.gz",
|
||||||
|
conda:
|
||||||
|
"envs/tidyverse.yml"
|
||||||
|
script:
|
||||||
|
"scripts/standardize_ucmr_234.R"
|
||||||
|
|
||||||
|
|
||||||
|
use rule standardize_ucmr_2 as standardize_ucmr_3 with:
|
||||||
|
input:
|
||||||
|
rules.fix_ucmr4_data_tbl.output,
|
||||||
|
output:
|
||||||
|
"results/ucmr_data/all_std_4.txv.gz",
|
||||||
|
|
||||||
|
|
||||||
use rule standardize_ucmr_3 as standardize_ucmr_4 with:
|
use rule standardize_ucmr_3 as standardize_ucmr_4 with:
|
||||||
input:
|
input:
|
||||||
|
@ -100,10 +118,140 @@ use rule standardize_ucmr_3 as standardize_ucmr_4 with:
|
||||||
"results/ucmr_data/all_std_4.txv.gz",
|
"results/ucmr_data/all_std_4.txv.gz",
|
||||||
|
|
||||||
|
|
||||||
|
rule concat_ucmr:
|
||||||
|
input:
|
||||||
|
rules.standardize_ucmr_1.output,
|
||||||
|
rules.standardize_ucmr_2.output,
|
||||||
|
rules.standardize_ucmr_3.output,
|
||||||
|
rules.standardize_ucmr_4.output,
|
||||||
|
output:
|
||||||
|
"results/ucmr_data/all_std.txv.gz",
|
||||||
|
conda:
|
||||||
|
"envs/tidyverse.yml"
|
||||||
|
script:
|
||||||
|
"scripts/concat_ucmr.R"
|
||||||
|
|
||||||
|
|
||||||
|
rule summarize_ucmr:
|
||||||
|
input:
|
||||||
|
rules.concat_ucmr.output,
|
||||||
|
output:
|
||||||
|
tap="results/ucmr_plots/tap.pdf",
|
||||||
|
plant="results/ucmr_plots/plant.pdf",
|
||||||
|
conda:
|
||||||
|
"envs/tidyverse.yml"
|
||||||
|
script:
|
||||||
|
"scripts/summarize_ucmr.R"
|
||||||
|
|
||||||
|
|
||||||
|
rule download_wqa_results:
|
||||||
|
output:
|
||||||
|
"resources/wqa/results.zip",
|
||||||
|
shell:
|
||||||
|
"""
|
||||||
|
curl -Ss -q -X POST --header 'Content-Type: application/json' \
|
||||||
|
--header 'Accept: application/zip' \
|
||||||
|
-d '{{"countrycode":["US"],
|
||||||
|
"statecode":["US:24"],
|
||||||
|
"countycode":["US:24:031"],
|
||||||
|
"within":"20",
|
||||||
|
"lat":"39.109",
|
||||||
|
"long":"-77.2489",
|
||||||
|
"dataProfile":"resultPhysChem",
|
||||||
|
"providers":["NWIS","STEWARDS","STORET"]
|
||||||
|
}}' \
|
||||||
|
'https://www.waterqualitydata.us/data/Result/search?mimeType=tsv&zip=yes' \
|
||||||
|
> {output}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
rule download_wqa_station:
|
||||||
|
output:
|
||||||
|
"resources/wqa/station.zip",
|
||||||
|
shell:
|
||||||
|
"""
|
||||||
|
curl -Ss -q -X POST --header 'Content-Type: application/json' \
|
||||||
|
--header 'Accept: application/zip' \
|
||||||
|
-d '{{"countrycode":["US"],
|
||||||
|
"statecode":["US:24"],
|
||||||
|
"countycode":["US:24:031"],
|
||||||
|
"within":"20",
|
||||||
|
"lat":"39.109",
|
||||||
|
"long":"-77.2489",
|
||||||
|
"providers":["NWIS","STEWARDS","STORET"]
|
||||||
|
}}' \
|
||||||
|
'https://www.waterqualitydata.us/data/Station/search?mimeType=tsv&zip=yes' \
|
||||||
|
> {output}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
use rule unzip_ucmr_2 as unzip_wqa_results with:
|
||||||
|
input:
|
||||||
|
rules.download_wqa_results.output,
|
||||||
|
output:
|
||||||
|
"results/wqa/src/results/resultphyschem.tsv",
|
||||||
|
|
||||||
|
|
||||||
|
use rule unzip_ucmr_2 as unzip_wqa_station with:
|
||||||
|
input:
|
||||||
|
rules.download_wqa_station.output,
|
||||||
|
output:
|
||||||
|
"results/wqa/src/station/station.tsv",
|
||||||
|
|
||||||
|
|
||||||
|
rule standardize_wqa:
|
||||||
|
input:
|
||||||
|
station=rules.unzip_wqa_station.output,
|
||||||
|
results=rules.unzip_wqa_results.output,
|
||||||
|
output:
|
||||||
|
"results/wqa/process/all.tsv.gz",
|
||||||
|
conda:
|
||||||
|
"envs/tidyverse.yml"
|
||||||
|
script:
|
||||||
|
"scripts/standardize_wqa.R"
|
||||||
|
|
||||||
|
|
||||||
|
rule download_water_report:
|
||||||
|
output:
|
||||||
|
"resources/wssc/{year}.pdf",
|
||||||
|
params:
|
||||||
|
url=lambda w: water_reports[int(w.year)],
|
||||||
|
shell:
|
||||||
|
"curl -sS -L -o {output} {params.url}"
|
||||||
|
|
||||||
|
|
||||||
|
rule parse_water_report:
|
||||||
|
input:
|
||||||
|
rules.download_water_report.output,
|
||||||
|
output:
|
||||||
|
"results/wssc/{year}.tsv",
|
||||||
|
script:
|
||||||
|
"scripts/wssc_to_table.py"
|
||||||
|
|
||||||
|
|
||||||
|
rule cat_reports:
|
||||||
|
input:
|
||||||
|
expand(rules.parse_water_report.output, year=water_reports),
|
||||||
|
output:
|
||||||
|
"results/wssc/all.tsv",
|
||||||
|
shell:
|
||||||
|
"cat {input} > {output}"
|
||||||
|
|
||||||
|
|
||||||
|
rule analyse_reports:
|
||||||
|
input:
|
||||||
|
rules.cat_reports.output,
|
||||||
|
output:
|
||||||
|
limit="results/wssc/binned_limit.tsv",
|
||||||
|
nolimit="results/wssc/detected_nolimit.tsv",
|
||||||
|
conda:
|
||||||
|
"envs/tidyverse.yml"
|
||||||
|
script:
|
||||||
|
"scripts/analyze_wssc.R"
|
||||||
|
|
||||||
|
|
||||||
rule all:
|
rule all:
|
||||||
input:
|
input:
|
||||||
rules.standardize_ucmr_3.output,
|
rules.summarize_ucmr.output,
|
||||||
rules.standardize_ucmr_4.output,
|
rules.standardize_wqa.output,
|
||||||
# expand(rules.unzip_ucmr.output, ucmr=[1, 2, 3]),
|
rules.analyse_reports.output,
|
||||||
# rules.fix_ucmr4_data_tbl.output,
|
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
split_df <- function(df, test) {
|
||||||
|
list(i = filter(df, {{ test }}), o = filter(df, !{{ test }}))
|
||||||
|
}
|
||||||
|
|
||||||
|
path <- snakemake@input[[1]]
|
||||||
|
|
||||||
|
df <- readr::read_tsv(
|
||||||
|
path,
|
||||||
|
col_types = "iccddddddd",
|
||||||
|
col_names =
|
||||||
|
c("year",
|
||||||
|
"species",
|
||||||
|
"unit",
|
||||||
|
"ave_lower",
|
||||||
|
"ave_upper",
|
||||||
|
"min_lower",
|
||||||
|
"min_upper",
|
||||||
|
"max_lower",
|
||||||
|
"max_upper",
|
||||||
|
"limit"
|
||||||
|
)
|
||||||
|
) %>%
|
||||||
|
# there are some TTHM/HHA5 entries in here twice, use the ones with limits
|
||||||
|
filter(!str_detect(species, "(TTHM|HAA5)") | limit > 0)
|
||||||
|
|
||||||
|
has_limit <- df %>%
|
||||||
|
group_by(species) %>%
|
||||||
|
summarize(limit = max(limit)) %>%
|
||||||
|
filter(limit > 0) %>%
|
||||||
|
pull(species)
|
||||||
|
|
||||||
|
limited <- split_df(df, species %in% has_limit)
|
||||||
|
|
||||||
|
binned_limit <- limited$i %>%
|
||||||
|
group_by(species) %>%
|
||||||
|
summarize(av = max(ave_upper), mx = max(max_upper), limit = max(limit), .groups = "drop") %>%
|
||||||
|
mutate(bin = case_when(mx == 0 ~ "undetected",
|
||||||
|
mx > limit ~ "over",
|
||||||
|
mx > limit / 10 ~ "over10",
|
||||||
|
mx > limit / 100 ~ "over100",
|
||||||
|
TRUE ~ "safeIGuess")) %>%
|
||||||
|
filter(bin != "undetected") %>%
|
||||||
|
arrange(bin, species) %>%
|
||||||
|
readr::write_tsv(snakemake@output[["limit"]])
|
||||||
|
|
||||||
|
detected_nolimit <- limited$o %>%
|
||||||
|
group_by(species) %>%
|
||||||
|
summarize(av = max(ave_upper), mx = max(max_upper)) %>%
|
||||||
|
mutate(detected = mx > 0) %>%
|
||||||
|
filter(detected) %>%
|
||||||
|
arrange(species) %>%
|
||||||
|
readr::write_tsv(snakemake@output[["nolimit"]])
|
|
@ -0,0 +1,6 @@
|
||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
snakemake@input %>%
|
||||||
|
map_dfr(~ readr::read_tsv(.x, col_types = "c")) %>%
|
||||||
|
replace_na(list(value = 0)) %>%
|
||||||
|
readr::write_tsv(snakemake@output[[1]])
|
|
@ -0,0 +1,31 @@
|
||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
read_tsv <- function(path) {
|
||||||
|
readr::read_tsv(
|
||||||
|
path,
|
||||||
|
col_types = cols(
|
||||||
|
PWSID = "c",
|
||||||
|
PWSName = "c",
|
||||||
|
"Facility ID" = "c",
|
||||||
|
"Sample point ID" = "c",
|
||||||
|
"Sample point type" = "c",
|
||||||
|
"Sample collection date" = "c",
|
||||||
|
Contaminant = "c",
|
||||||
|
"Unit measure" = "c",
|
||||||
|
Result = "d",
|
||||||
|
.default = "-"
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
df <- snakemake@input %>%
|
||||||
|
map_dfr(read_tsv) %>%
|
||||||
|
rename(sample_point_ID = "Sample point ID",
|
||||||
|
sample_point_type = "Sample point type",
|
||||||
|
facility_ID = "Facility ID",
|
||||||
|
date = "Sample collection date",
|
||||||
|
species = Contaminant,
|
||||||
|
unit = "Unit measure",
|
||||||
|
value = Result) %>%
|
||||||
|
mutate(date = as.Date(date, format = "%m/%d/%y")) %>%
|
||||||
|
readr::write_tsv(snakemake@output[[1]])
|
||||||
|
|
|
@ -5,6 +5,7 @@ snakemake@input[[1]] %>%
|
||||||
col_types = cols(
|
col_types = cols(
|
||||||
PWSID = "c",
|
PWSID = "c",
|
||||||
PWSName = "c",
|
PWSName = "c",
|
||||||
|
FacilityID = "c",
|
||||||
FacilityName = "c",
|
FacilityName = "c",
|
||||||
SamplePointID = "c",
|
SamplePointID = "c",
|
||||||
SamplePointName = "c",
|
SamplePointName = "c",
|
||||||
|
@ -15,7 +16,15 @@ snakemake@input[[1]] %>%
|
||||||
AnalyticalResultValue = "d",
|
AnalyticalResultValue = "d",
|
||||||
.default = "-"
|
.default = "-"
|
||||||
)) %>%
|
)) %>%
|
||||||
mutate(CollectionDate = as.Date(CollectionDate)) %>%
|
rename(
|
||||||
rename(value = "AnalyticalResultValue") %>%
|
sample_point_ID = SamplePointID,
|
||||||
filter(PWSID == snakemake@config[["my_pswid"]]) %>%
|
sample_point_name = SamplePointName,
|
||||||
|
sample_point_type = SamplePointType,
|
||||||
|
facility_ID = FacilityID,
|
||||||
|
facility_Name = FacilityName,
|
||||||
|
date = CollectionDate,
|
||||||
|
species = Contaminant,
|
||||||
|
unit = AnalyticalResultsSign,
|
||||||
|
value = AnalyticalResultValue) %>%
|
||||||
|
mutate(date = as.Date(date, format = "%m/%d/%Y")) %>%
|
||||||
readr::write_tsv(snakemake@output[[1]])
|
readr::write_tsv(snakemake@output[[1]])
|
|
@ -0,0 +1,107 @@
|
||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
|
||||||
|
site_df <- readr::read_tsv(
|
||||||
|
snakemake@input[["station"]],
|
||||||
|
col_types = cols(
|
||||||
|
MonitoringLocationIdentifier = "f",
|
||||||
|
MonitoringLocationTypeName = "f",
|
||||||
|
MonitoringLocationName = "f",
|
||||||
|
MonitoringLocationDescriptionText = "c",
|
||||||
|
LatitudeMeasure = "d",
|
||||||
|
LongitudeMeasure = "d",
|
||||||
|
.default = "-")
|
||||||
|
) %>%
|
||||||
|
rename(
|
||||||
|
location = MonitoringLocationIdentifier,
|
||||||
|
location_type = MonitoringLocationTypeName,
|
||||||
|
location_name = MonitoringLocationName,
|
||||||
|
location_desc = MonitoringLocationDescriptionText,
|
||||||
|
lat = LatitudeMeasure,
|
||||||
|
long = LongitudeMeasure,
|
||||||
|
)
|
||||||
|
|
||||||
|
# This has a bunch of crap in it that has nothing to do with chemicals in
|
||||||
|
# water (which might make amphibians gay). Additionally, there are many
|
||||||
|
# different units that need to be standardized (eventually to be put in
|
||||||
|
# terms of ug/ml)
|
||||||
|
result_df <- readr::read_tsv(
|
||||||
|
snakemake@input[["results"]],
|
||||||
|
col_types = cols(
|
||||||
|
ActivityTypeCode = "f",
|
||||||
|
ActivityStartDate = "D",
|
||||||
|
ActivityEndDate = "D",
|
||||||
|
ActivityMediaName = "f",
|
||||||
|
MonitoringLocationIdentifier = "f",
|
||||||
|
CharacteristicName = "f",
|
||||||
|
ResultMeasureValue = "c",
|
||||||
|
"ResultMeasure/MeasureUnitCode" = "c",
|
||||||
|
"DetectionQuantitationLimitMeasure/MeasureValue" = "c",
|
||||||
|
"DetectionQuantitationLimitMeasure/MeasureUnitCode" = "f",
|
||||||
|
.default = "-",
|
||||||
|
)
|
||||||
|
) %>%
|
||||||
|
rename(activity = ActivityTypeCode,
|
||||||
|
unit = "ResultMeasure/MeasureUnitCode",
|
||||||
|
media = ActivityMediaName,
|
||||||
|
start = ActivityStartDate,
|
||||||
|
end = ActivityEndDate,
|
||||||
|
location = MonitoringLocationIdentifier,
|
||||||
|
value = ResultMeasureValue,
|
||||||
|
limit = "DetectionQuantitationLimitMeasure/MeasureValue",
|
||||||
|
limit_unit = "DetectionQuantitationLimitMeasure/MeasureUnitCode",
|
||||||
|
species = CharacteristicName) %>%
|
||||||
|
arrange(start) %>%
|
||||||
|
filter(media == "Water") %>%
|
||||||
|
select(-media) %>%
|
||||||
|
# select values that are numbers (some are just descriptive strings) assuming
|
||||||
|
# blanks are actually zeros (to be filtered out later)
|
||||||
|
replace_na(list(value = "0")) %>%
|
||||||
|
filter(str_detect(value, "^-?\\d+\\.?\\d*$")) %>%
|
||||||
|
mutate(value = as.numeric(value)) %>%
|
||||||
|
# select units that are mass concentrations or "counts per something"
|
||||||
|
filter(str_detect(unit, "^.*g/.*(l|L|g)$") |
|
||||||
|
unit %in% c("%", "ppb", "ppm")) %>%
|
||||||
|
# remove a bunch of crap with "%" units
|
||||||
|
filter(! str_detect(species, "SSC|Cloud cover|Sediment|solids|demand")) %>%
|
||||||
|
filter(! str_detect(species, "Pha?eophytin|Chlorophyll|Alkalinity")) %>%
|
||||||
|
filter(species != "Sodium, percent total cations" # not sure what this means
|
||||||
|
& species != "Dissolved oxygen saturation" # whatever
|
||||||
|
& species != "Water" # ironic...
|
||||||
|
& species != "Barometric pressure" # not a chemical
|
||||||
|
& species != "Relative humidity" # not a chemical either
|
||||||
|
& species != "Extract volume" # ''
|
||||||
|
& species != "Volume, total" # ''
|
||||||
|
& species != "Acidity, (H+)" # will change later
|
||||||
|
& species != "Carbon dioxide" # ditto
|
||||||
|
& species != "Dissolved oxygen (DO)" # ditto
|
||||||
|
& species != "Total hardness" # not specific
|
||||||
|
) %>%
|
||||||
|
# these seems like a typos
|
||||||
|
mutate(species = case_when(
|
||||||
|
species == "Diazinon0" ~ "Diazinon",
|
||||||
|
species == "Phosphate-phosphorus***retired***use Total Phosphorus, mixed forms" ~ "Total Phosphorus, mixed forms",
|
||||||
|
species == "Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite" ~ "Nitrate + Nitrite",
|
||||||
|
TRUE ~ species
|
||||||
|
)) %>%
|
||||||
|
# collapse units to (f/n/u/m)g/L
|
||||||
|
mutate(unit = str_replace(unit, "/l", "/L"),
|
||||||
|
unit = str_replace(unit, "kg", "L"),
|
||||||
|
unit = case_when(unit == "ppb" ~ "ug/L",
|
||||||
|
unit == "ppm" ~ "mg/L",
|
||||||
|
TRUE ~ unit),
|
||||||
|
# percent will just be in mg/L
|
||||||
|
value = if_else(unit == "%", value * 10, value),
|
||||||
|
unit = if_else(unit == "%", "mg/L", unit)) %>%
|
||||||
|
# standardize all values to ug/L
|
||||||
|
mutate(std_value = case_when(unit == "g/L" ~ value * 1e6,
|
||||||
|
unit == "mg/L" ~ value * 1e3,
|
||||||
|
unit == "ng/L" ~ value / 1e3,
|
||||||
|
unit == "fg/L" ~ value / 1e6,
|
||||||
|
TRUE ~ value)) %>%
|
||||||
|
select(-value, -unit)
|
||||||
|
|
||||||
|
result_df %>%
|
||||||
|
left_join(site_df, by = "location") %>%
|
||||||
|
select(-location) %>%
|
||||||
|
readr::write_tsv(snakemake@output[[1]])
|
|
@ -0,0 +1,32 @@
|
||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
df <- readr::read_tsv(
|
||||||
|
snakemake@input[[1]],
|
||||||
|
col_types = cols(date = "D", value = "d", .default = "c")
|
||||||
|
) %>%
|
||||||
|
filter(PWSID == snakemake@config[["my_pswid"]]) %>%
|
||||||
|
select(-PWSID, -PWSName)
|
||||||
|
|
||||||
|
# purification plant sample point
|
||||||
|
df %>%
|
||||||
|
filter(sample_point_ID == snakemake@config[["my_plant_id"]]) %>%
|
||||||
|
filter(sample_point_type == "EP") %>%
|
||||||
|
ggplot(aes(value, fct_rev(species))) +
|
||||||
|
geom_jitter() +
|
||||||
|
scale_x_log10() +
|
||||||
|
labs(x = "level (ug/ml)",
|
||||||
|
y = "species",
|
||||||
|
title = "Purification Plant")
|
||||||
|
ggsave(snakemake@output[["plant"]])
|
||||||
|
|
||||||
|
# tap sample points
|
||||||
|
df %>%
|
||||||
|
filter(sample_point_type %in% c("DS", "MR")) %>%
|
||||||
|
ggplot(aes(value, fct_rev(species))) +
|
||||||
|
geom_jitter() +
|
||||||
|
scale_x_log10() +
|
||||||
|
labs(x = "level (ug/ml)",
|
||||||
|
y = "species",
|
||||||
|
title = "Purification Plant")
|
||||||
|
ggsave(snakemake@output[["tap"]])
|
||||||
|
|
|
@ -0,0 +1,120 @@
|
||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
split_df <- function(df, flt) {
|
||||||
|
.in <- df %>%
|
||||||
|
filter({{ flt }})
|
||||||
|
.out <- df %>%
|
||||||
|
filter(! {{ flt }})
|
||||||
|
list(i = .in,
|
||||||
|
o = .out)
|
||||||
|
}
|
||||||
|
|
||||||
|
df <- readr::read_tsv(
|
||||||
|
"../../results/wqa/process/all.tsv.gz",
|
||||||
|
col_types = cols(
|
||||||
|
start = "D",
|
||||||
|
species = "f",
|
||||||
|
std_value = "d",
|
||||||
|
lat = "d",
|
||||||
|
long = "d",
|
||||||
|
location_name = "c",
|
||||||
|
.default = "-"
|
||||||
|
)
|
||||||
|
) %>%
|
||||||
|
# get rid of the deuterium distinction on some pharma species
|
||||||
|
mutate(species = str_replace(species, "-(d|D)\\d", ""))
|
||||||
|
|
||||||
|
not_detected <- df %>%
|
||||||
|
group_by(species) %>%
|
||||||
|
summarize(total = sum(std_value)) %>%
|
||||||
|
filter(total == 0) %>%
|
||||||
|
pull(species)
|
||||||
|
|
||||||
|
|
||||||
|
harmless <- c(
|
||||||
|
"Sodium",
|
||||||
|
"Bicarbonate",
|
||||||
|
"Calcium",
|
||||||
|
"Magnesium",
|
||||||
|
"Potassium",
|
||||||
|
"Carbonate",
|
||||||
|
"Oxygen",
|
||||||
|
"Silica"
|
||||||
|
)
|
||||||
|
|
||||||
|
df_detected <- df %>%
|
||||||
|
filter(! species %in% not_detected) %>%
|
||||||
|
filter(! species %in% harmless)
|
||||||
|
|
||||||
|
df %>%
|
||||||
|
filter(lubridate::year(start) > 1990) %>%
|
||||||
|
group_by(species) %>%
|
||||||
|
summarize(fraction = mean(std_value > 0),
|
||||||
|
n = n()) %>%
|
||||||
|
mutate(stderr = sqrt(fraction * (1 - fraction) / n)) %>%
|
||||||
|
filter(n > 3) %>%
|
||||||
|
ggplot(aes(fraction, fct_reorder(species, fraction))) +
|
||||||
|
geom_col() +
|
||||||
|
geom_errorbarh(aes(xmin = fraction - stderr, xmax = fraction + stderr))
|
||||||
|
|
||||||
|
metals <- c(
|
||||||
|
"Lithium",
|
||||||
|
"Beryllium",
|
||||||
|
"Boron",
|
||||||
|
"Aluminum",
|
||||||
|
"Vanadium",
|
||||||
|
"Chromium",
|
||||||
|
"Manganese",
|
||||||
|
"Iron",
|
||||||
|
"Cobalt",
|
||||||
|
"Nickel",
|
||||||
|
"Copper",
|
||||||
|
"Zinc",
|
||||||
|
"Arsenic",
|
||||||
|
"Selenium",
|
||||||
|
"Strontium",
|
||||||
|
"Molybdenum",
|
||||||
|
"Silver",
|
||||||
|
"Cadmium",
|
||||||
|
"Antimony",
|
||||||
|
"Barium",
|
||||||
|
"Mercury",
|
||||||
|
"Thallium",
|
||||||
|
"Lead",
|
||||||
|
"Uranium"
|
||||||
|
)
|
||||||
|
|
||||||
|
halides <- c("Chloride", "Fluoride", "Bromide")
|
||||||
|
|
||||||
|
.nitro <- split_df(df_detected, str_detect(species, "(n|N)itr")
|
||||||
|
| str_detect(species, "Ammon"))
|
||||||
|
|
||||||
|
.phospho <- split_df(.nitro$o, str_detect(species, "(P|p)hosph"))
|
||||||
|
|
||||||
|
.metal <- split_df(.phospho$o, species %in% metals)
|
||||||
|
|
||||||
|
.halides <- split_df(.metal$o, species %in% halides)
|
||||||
|
|
||||||
|
.nitro$i %>%
|
||||||
|
ggplot(aes(start, std_value, color = species, group = species)) +
|
||||||
|
geom_line()
|
||||||
|
|
||||||
|
.halides$i %>%
|
||||||
|
ggplot(aes(start, std_value)) +
|
||||||
|
geom_point() +
|
||||||
|
facet_wrap(scales = "free", c("species"))
|
||||||
|
|
||||||
|
.metal$i %>%
|
||||||
|
ggplot(aes(start, std_value)) +
|
||||||
|
geom_point() +
|
||||||
|
facet_wrap(scales = "free", c("species"))
|
||||||
|
|
||||||
|
.phospho$i %>%
|
||||||
|
ggplot(aes(start, std_value)) +
|
||||||
|
geom_point() +
|
||||||
|
facet_wrap(scales = "free", c("species"))
|
||||||
|
|
||||||
|
.halides$o %>%
|
||||||
|
filter(std_value > 1) %>%
|
||||||
|
ggplot(aes(std_value, species)) +
|
||||||
|
geom_jitter()
|
|
@ -0,0 +1,89 @@
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import subprocess as sp
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import NamedTuple
|
||||||
|
|
||||||
|
|
||||||
|
class CsvRow(NamedTuple):
|
||||||
|
year: int
|
||||||
|
species: str
|
||||||
|
unit: str
|
||||||
|
average_lower: float
|
||||||
|
average_upper: float
|
||||||
|
min_lower: float
|
||||||
|
min_upper: float
|
||||||
|
max_lower: float
|
||||||
|
max_upper: float
|
||||||
|
limit: float
|
||||||
|
|
||||||
|
|
||||||
|
def fmt_float(x: str) -> float:
|
||||||
|
if x == "n/d" or x == "n.d":
|
||||||
|
return 0
|
||||||
|
else:
|
||||||
|
# spaces sometimes show up if there is a superscript
|
||||||
|
return float(x.split(" ")[0])
|
||||||
|
|
||||||
|
|
||||||
|
def fmt_lt(x: str) -> tuple[float, float]:
|
||||||
|
if "<" == x[0]:
|
||||||
|
return (0, fmt_float(x.removeprefix("<")))
|
||||||
|
else:
|
||||||
|
return (y := fmt_float(x), y)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_chemical(year, line: list[str]) -> CsvRow:
|
||||||
|
try:
|
||||||
|
limit = float(re.match("\d+(\.\d+)?", line[5])[0])
|
||||||
|
except (TypeError, ValueError, IndexError):
|
||||||
|
limit = -1
|
||||||
|
a = fmt_lt(line[2])
|
||||||
|
mx = fmt_lt(line[3])
|
||||||
|
mi = fmt_lt(line[4])
|
||||||
|
return CsvRow(
|
||||||
|
year=year,
|
||||||
|
species=line[0],
|
||||||
|
unit=line[1].replace("µ", "u"),
|
||||||
|
average_lower=a[0],
|
||||||
|
average_upper=a[1],
|
||||||
|
min_lower=mi[0],
|
||||||
|
min_upper=mi[1],
|
||||||
|
max_lower=mx[0],
|
||||||
|
max_upper=mx[1],
|
||||||
|
limit=limit,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_pdf(year: int, ipath: Path) -> list[CsvRow]:
|
||||||
|
res = sp.run(
|
||||||
|
["pdftotext", "-f", "1", "-l", "4", "-r", "1000", "-layout", ipath, "-"],
|
||||||
|
capture_output=True,
|
||||||
|
)
|
||||||
|
if res.returncode == 0:
|
||||||
|
lns = [
|
||||||
|
l.strip()
|
||||||
|
for l in res.stdout.decode().splitlines()
|
||||||
|
if "/L" in l and not " of " in l and not " sample" in l
|
||||||
|
]
|
||||||
|
chemicals = [
|
||||||
|
s
|
||||||
|
for x in lns
|
||||||
|
if len(s := re.split("\s\s+", x)) > 2 and "Total Organic Carbon" not in s[0]
|
||||||
|
]
|
||||||
|
return [parse_chemical(year, c) for c in chemicals]
|
||||||
|
else:
|
||||||
|
assert False, res.stderr
|
||||||
|
|
||||||
|
|
||||||
|
def main(year: int, ipath: str, opath: str) -> None:
|
||||||
|
rows = parse_pdf(year, ipath)
|
||||||
|
with open(opath, "w") as f:
|
||||||
|
w = csv.writer(f, delimiter="\t")
|
||||||
|
for r in rows:
|
||||||
|
w.writerow(r)
|
||||||
|
|
||||||
|
|
||||||
|
main(snakemake.wildcards["year"], snakemake.input[0], snakemake.output[0])
|
Loading…
Reference in New Issue