From 0578a0ed897cd3b5b18d4cf81dd95e652b3e3f01 Mon Sep 17 00:00:00 2001 From: ndwarshuis Date: Wed, 5 Apr 2023 21:00:46 -0400 Subject: [PATCH] ADD lots of current wssc analysis --- config/config.yml | 1 + .../2023-04-05T200327.912016.snakemake.log | 14 ++ .../2023-04-05T200330.952218.snakemake.log | 3 + workflow/Snakefile | 222 +++++++++++++++--- workflow/scripts/analyze_wssc.R | 54 +++++ workflow/scripts/concat_ucmr.R | 6 + workflow/scripts/standardize_ucmr_1.R | 31 +++ ...rdize_ucmr_34.R => standardize_ucmr_234.R} | 15 +- workflow/scripts/standardize_wqa.R | 107 +++++++++ workflow/scripts/summarize_ucmr.R | 32 +++ workflow/scripts/summarize_wqa.R | 120 ++++++++++ workflow/scripts/wssc_to_table.py | 89 +++++++ 12 files changed, 654 insertions(+), 40 deletions(-) create mode 100644 workflow/.snakemake/log/2023-04-05T200327.912016.snakemake.log create mode 100644 workflow/.snakemake/log/2023-04-05T200330.952218.snakemake.log create mode 100644 workflow/scripts/analyze_wssc.R create mode 100644 workflow/scripts/concat_ucmr.R create mode 100644 workflow/scripts/standardize_ucmr_1.R rename workflow/scripts/{standardize_ucmr_34.R => standardize_ucmr_234.R} (50%) create mode 100644 workflow/scripts/standardize_wqa.R create mode 100644 workflow/scripts/summarize_ucmr.R create mode 100644 workflow/scripts/summarize_wqa.R create mode 100644 workflow/scripts/wssc_to_table.py diff --git a/config/config.yml b/config/config.yml index 19c4d21..7226111 100644 --- a/config/config.yml +++ b/config/config.yml @@ -1 +1,2 @@ my_pswid: MD0150005 +my_plant_id: "0100000" diff --git a/workflow/.snakemake/log/2023-04-05T200327.912016.snakemake.log b/workflow/.snakemake/log/2023-04-05T200327.912016.snakemake.log new file mode 100644 index 0000000..225a62a --- /dev/null +++ b/workflow/.snakemake/log/2023-04-05T200327.912016.snakemake.log @@ -0,0 +1,14 @@ +Full Traceback (most recent call last): + File "/home/ndwar/.local/share/conda/envs/moco-water/lib/python3.11/site-packages/snakemake/__init__.py", line 643, in snakemake + workflow.include( + File "/home/ndwar/.local/share/conda/envs/moco-water/lib/python3.11/site-packages/snakemake/workflow.py", line 1238, in include + exec(compile(code, snakefile.get_path_or_uri(), "exec"), self.globals) + File "/mnt/data/Dvl/env/home/water/workflow/Snakefile", line 7, in + configfile: "config/config.yml" + File "/home/ndwar/.local/share/conda/envs/moco-water/lib/python3.11/site-packages/snakemake/workflow.py", line 1311, in configfile + raise WorkflowError( +snakemake.exceptions.WorkflowError: Workflow defines configfile config/config.yml but it is not present or accessible (full checked path: /mnt/data/Dvl/env/home/water/workflow/config/config.yml). + +WorkflowError in file /mnt/data/Dvl/env/home/water/workflow/Snakefile, line 7: +Workflow defines configfile config/config.yml but it is not present or accessible (full checked path: /mnt/data/Dvl/env/home/water/workflow/config/config.yml). + File "/mnt/data/Dvl/env/home/water/workflow/Snakefile", line 7, in diff --git a/workflow/.snakemake/log/2023-04-05T200330.952218.snakemake.log b/workflow/.snakemake/log/2023-04-05T200330.952218.snakemake.log new file mode 100644 index 0000000..4983c0e --- /dev/null +++ b/workflow/.snakemake/log/2023-04-05T200330.952218.snakemake.log @@ -0,0 +1,3 @@ +WorkflowError in file /mnt/data/Dvl/env/home/water/workflow/Snakefile, line 7: +Workflow defines configfile config/config.yml but it is not present or accessible (full checked path: /mnt/data/Dvl/env/home/water/workflow/config/config.yml). 
+ File "/mnt/data/Dvl/env/home/water/workflow/Snakefile", line 7, in diff --git a/workflow/Snakefile b/workflow/Snakefile index d33cc65..61a04bc 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -4,26 +4,27 @@ from snakemake.utils import min_version min_version("7.20") + configfile: "config/config.yml" ucmr_data = { - 1: ( - "https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip", - "ucmr1_list1and2chem_final.xls", - ), - 2: ( - "https://www.epa.gov/sites/default/files/2015-09/ucmr2_occurrencedata_jan12.zip", - "UCMR2_All_OccurrenceData_Jan12.txt", - ), - 3: ( - "https://www.epa.gov/sites/default/files/2017-02/ucmr-3-occurrence-data.zip", - "UCMR3_All.txt", - ), - 4: ( - "https://www.epa.gov/sites/default/files/2020-04/ucmr_4_occurrence_data.zip", - "UCMR4_All.txt", - ), + 2: "https://www.epa.gov/sites/default/files/2015-09/ucmr2_occurrencedata_jan12.zip", + 3: "https://www.epa.gov/sites/default/files/2017-02/ucmr-3-occurrence-data.zip", + 4: "https://www.epa.gov/sites/default/files/2020-04/ucmr_4_occurrence_data.zip", +} + +water_reports = { + 2022: "https://www.wsscwater.com/sites/default/files/2023-03/2022%20POT%20%26%20PAX%20Tap%20Report.pdf", + 2021: "https://www.wsscwater.com/sites/default/files/2022-07/2021%20POT%20%26%20PAX%20Tap%20Report.pdf", + 2020: "https://www.wsscwater.com/sites/default/files/2021-04/2020%20POT%20%26%20PAX%20Tap%20Report.pdf", + 2019: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2019%20POT%20%26%20PAX%20Tap%20Report.pdf", + 2018: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2018%20POT%20%26%20PAX%20Tap%20Report.pdf", + 2017: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2017%20POT%20%26%20PAX%20Tap_Report.pdf", + 2016: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2016%20PAXPOT%20%20WQR.pdf", + 2015: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2015%20POT%20%26%20PAX%20WQR%20Final%20050516.pdf", + 2014: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2014%20POT%20%20PAX%20WQR%20Draft%20022715%20Corrected%20030915.pdf", + 2013: "https://www.wsscwater.com/files/live/sites/wssc/files/PDFs/TapAnalysis2013_27546.pdf", } @@ -31,16 +32,16 @@ rule download_ucmr: output: "resources/ucmr/{ucmr}.zip", params: - url=lambda w: ucmr_data[int(w.ucmr)][0], + url=lambda w: ucmr_data[int(w.ucmr)], shell: "curl -sS -L -o {output} {params.url}" -rule unzip_ucmr_1: +rule unzip_ucmr_2: input: - expand(rules.download_ucmr.output, ucmr=1), + expand(rules.download_ucmr.output, ucmr=2), output: - "results/ucmr/ucmr_unzipped_1/ucmr1_list1and2chem_final.xls", + "results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt", params: zipdest=lambda _, output: dirname(output[0]), shell: @@ -51,27 +52,21 @@ rule unzip_ucmr_1: """ -use rule unzip_ucmr_1 as unzip_ucmr_2 with: - input: - expand(rules.download_ucmr.output, ucmr=2), - output: - "results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt", - - -use rule unzip_ucmr_1 as unzip_ucmr_3 with: +use rule unzip_ucmr_2 as unzip_ucmr_3 with: input: expand(rules.download_ucmr.output, ucmr=3), output: "results/ucmr/ucmr_unzipped_3/UCMR3_All.txt", -use rule unzip_ucmr_1 as unzip_ucmr_4 with: +use rule unzip_ucmr_2 as unzip_ucmr_4 with: input: expand(rules.download_ucmr.output, ucmr=4), output: "results/ucmr/ucmr_unzipped_4/UCMR4_All.txt", +# they used a real micro symbol instead of "u", which makes R choke rule fix_ucmr4_data_tbl: input: rules.unzip_ucmr_4.output, @@ -83,15 +78,38 
@@ rule fix_ucmr4_data_tbl:
     """
 
 
-rule standardize_ucmr_3:
+# manually make these data files
+# 1) download zip from here: https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip
+# 2) open in localc
+# 3) save each of the 'DPCache' tables to tsv files (there should be three)
+rule standardize_ucmr_1:
     input:
-        rules.unzip_ucmr_3.output,
+        expand("resources/ucmr/ucmr1/ucmr1_list1and2chem_final_{i}.tsv", i=[1, 2, 3]),
     output:
-        "results/ucmr_data/all_std_3.txv.gz",
+        "results/ucmr_data/all_std_1.txv.gz",
     conda:
         "envs/tidyverse.yml"
     script:
-        "scripts/standardize_ucmr_34.R"
+        "scripts/standardize_ucmr_1.R"
+
+
+rule standardize_ucmr_2:
+    input:
+        rules.unzip_ucmr_2.output,
+    output:
+        "results/ucmr_data/all_std_2.txv.gz",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/standardize_ucmr_234.R"
+
+
+use rule standardize_ucmr_2 as standardize_ucmr_3 with:
+    input:
+        rules.unzip_ucmr_3.output,
+    output:
+        "results/ucmr_data/all_std_3.txv.gz",
+
 
 use rule standardize_ucmr_3 as standardize_ucmr_4 with:
     input:
@@ -100,10 +118,140 @@ use rule standardize_ucmr_3 as standardize_ucmr_4 with:
         "results/ucmr_data/all_std_4.txv.gz",
 
 
+rule concat_ucmr:
+    input:
+        rules.standardize_ucmr_1.output,
+        rules.standardize_ucmr_2.output,
+        rules.standardize_ucmr_3.output,
+        rules.standardize_ucmr_4.output,
+    output:
+        "results/ucmr_data/all_std.txv.gz",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/concat_ucmr.R"
+
+
+rule summarize_ucmr:
+    input:
+        rules.concat_ucmr.output,
+    output:
+        tap="results/ucmr_plots/tap.pdf",
+        plant="results/ucmr_plots/plant.pdf",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/summarize_ucmr.R"
+
+
+rule download_wqa_results:
+    output:
+        "resources/wqa/results.zip",
+    shell:
+        """
+        curl -Ss -q -X POST --header 'Content-Type: application/json' \
+        --header 'Accept: application/zip' \
+        -d '{{"countrycode":["US"],
+             "statecode":["US:24"],
+             "countycode":["US:24:031"],
+             "within":"20",
+             "lat":"39.109",
+             "long":"-77.2489",
+             "dataProfile":"resultPhysChem",
+             "providers":["NWIS","STEWARDS","STORET"]
+            }}' \
+        'https://www.waterqualitydata.us/data/Result/search?mimeType=tsv&zip=yes' \
+        > {output}
+        """
+
+
+rule download_wqa_station:
+    output:
+        "resources/wqa/station.zip",
+    shell:
+        """
+        curl -Ss -q -X POST --header 'Content-Type: application/json' \
+        --header 'Accept: application/zip' \
+        -d '{{"countrycode":["US"],
+             "statecode":["US:24"],
+             "countycode":["US:24:031"],
+             "within":"20",
+             "lat":"39.109",
+             "long":"-77.2489",
+             "providers":["NWIS","STEWARDS","STORET"]
+            }}' \
+        'https://www.waterqualitydata.us/data/Station/search?mimeType=tsv&zip=yes' \
+        > {output}
+        """
+
+
+use rule unzip_ucmr_2 as unzip_wqa_results with:
+    input:
+        rules.download_wqa_results.output,
+    output:
+        "results/wqa/src/results/resultphyschem.tsv",
+
+
+use rule unzip_ucmr_2 as unzip_wqa_station with:
+    input:
+        rules.download_wqa_station.output,
+    output:
+        "results/wqa/src/station/station.tsv",
+
+
+rule standardize_wqa:
+    input:
+        station=rules.unzip_wqa_station.output,
+        results=rules.unzip_wqa_results.output,
+    output:
+        "results/wqa/process/all.tsv.gz",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/standardize_wqa.R"
+
+
+rule download_water_report:
+    output:
+        "resources/wssc/{year}.pdf",
+    params:
+        url=lambda w: water_reports[int(w.year)],
+    shell:
+        "curl -sS -L -o {output} {params.url}"
+
+
+rule parse_water_report:
+    input:
+        rules.download_water_report.output,
+    output:
+        "results/wssc/{year}.tsv",
+    script:
+        "scripts/wssc_to_table.py"
+
+
+rule cat_reports:
+    input:
+        expand(rules.parse_water_report.output, year=water_reports),
+    output:
+        "results/wssc/all.tsv",
+    shell:
+        "cat {input} > {output}"
+
+
+rule analyse_reports:
+    input:
+        rules.cat_reports.output,
+    output:
+        limit="results/wssc/binned_limit.tsv",
+        nolimit="results/wssc/detected_nolimit.tsv",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/analyze_wssc.R"
+
 
 rule all:
     input:
-        rules.standardize_ucmr_3.output,
-        rules.standardize_ucmr_4.output,
-        # expand(rules.unzip_ucmr.output, ucmr=[1, 2, 3]),
-        # rules.fix_ucmr4_data_tbl.output,
+        rules.summarize_ucmr.output,
+        rules.standardize_wqa.output,
+        rules.analyse_reports.output,
diff --git a/workflow/scripts/analyze_wssc.R b/workflow/scripts/analyze_wssc.R
new file mode 100644
index 0000000..777e936
--- /dev/null
+++ b/workflow/scripts/analyze_wssc.R
@@ -0,0 +1,54 @@
+library(tidyverse)
+
+split_df <- function(df, test) {
+  list(i = filter(df, {{ test }}), o = filter(df, !{{ test }}))
+}
+
+path <- snakemake@input[[1]]
+
+df <- readr::read_tsv(
+  path,
+  col_types = "iccddddddd",
+  col_names =
+    c("year",
+      "species",
+      "unit",
+      "ave_lower",
+      "ave_upper",
+      "min_lower",
+      "min_upper",
+      "max_lower",
+      "max_upper",
+      "limit"
+      )
+) %>%
+  # there are some TTHM/HAA5 entries in here twice, use the ones with limits
+  filter(!str_detect(species, "(TTHM|HAA5)") | limit > 0)
+
+has_limit <- df %>%
+  group_by(species) %>%
+  summarize(limit = max(limit)) %>%
+  filter(limit > 0) %>%
+  pull(species)
+
+limited <- split_df(df, species %in% has_limit)
+
+binned_limit <- limited$i %>%
+  group_by(species) %>%
+  summarize(av = max(ave_upper), mx = max(max_upper), limit = max(limit), .groups = "drop") %>%
+  mutate(bin = case_when(mx == 0 ~ "undetected",
+                         mx > limit ~ "over",
+                         mx > limit / 10 ~ "over10",
+                         mx > limit / 100 ~ "over100",
+                         TRUE ~ "safeIGuess")) %>%
+  filter(bin != "undetected") %>%
+  arrange(bin, species) %>%
+  readr::write_tsv(snakemake@output[["limit"]])
+
+detected_nolimit <- limited$o %>%
+  group_by(species) %>%
+  summarize(av = max(ave_upper), mx = max(max_upper)) %>%
+  mutate(detected = mx > 0) %>%
+  filter(detected) %>%
+  arrange(species) %>%
+  readr::write_tsv(snakemake@output[["nolimit"]])
diff --git a/workflow/scripts/concat_ucmr.R b/workflow/scripts/concat_ucmr.R
new file mode 100644
index 0000000..05bbf24
--- /dev/null
+++ b/workflow/scripts/concat_ucmr.R
@@ -0,0 +1,6 @@
+library(tidyverse)
+
+snakemake@input %>%
+  map_dfr(~ readr::read_tsv(.x, col_types = "c")) %>%
+  replace_na(list(value = 0)) %>%
+  readr::write_tsv(snakemake@output[[1]])
diff --git a/workflow/scripts/standardize_ucmr_1.R b/workflow/scripts/standardize_ucmr_1.R
new file mode 100644
index 0000000..7458643
--- /dev/null
+++ b/workflow/scripts/standardize_ucmr_1.R
@@ -0,0 +1,31 @@
+library(tidyverse)
+
+read_tsv <- function(path) {
+  readr::read_tsv(
+    path,
+    col_types = cols(
+      PWSID = "c",
+      PWSName = "c",
+      "Facility ID" = "c",
+      "Sample point ID" = "c",
+      "Sample point type" = "c",
+      "Sample collection date" = "c",
+      Contaminant = "c",
+      "Unit measure" = "c",
+      Result = "d",
+      .default = "-"
+    ))
+}
+
+df <- snakemake@input %>%
+  map_dfr(read_tsv) %>%
+  rename(sample_point_ID = "Sample point ID",
+         sample_point_type = "Sample point type",
+         facility_ID = "Facility ID",
+         date = "Sample collection date",
+         species = Contaminant,
+         unit = "Unit measure",
+         value = Result) %>%
+  mutate(date = as.Date(date, format = "%m/%d/%y")) %>%
+  readr::write_tsv(snakemake@output[[1]])
+
diff --git a/workflow/scripts/standardize_ucmr_34.R b/workflow/scripts/standardize_ucmr_234.R
similarity index 50%
rename from workflow/scripts/standardize_ucmr_34.R
rename to workflow/scripts/standardize_ucmr_234.R
index 90f060e..a98bca8 100644
--- a/workflow/scripts/standardize_ucmr_34.R
+++ b/workflow/scripts/standardize_ucmr_234.R
@@ -5,6 +5,7 @@ snakemake@input[[1]] %>%
     col_types = cols(
       PWSID = "c",
       PWSName = "c",
+      FacilityID = "c",
       FacilityName = "c",
       SamplePointID = "c",
       SamplePointName = "c",
@@ -15,7 +16,15 @@ snakemake@input[[1]] %>%
       AnalyticalResultValue = "d",
       .default = "-"
     )) %>%
-  mutate(CollectionDate = as.Date(CollectionDate)) %>%
-  rename(value = "AnalyticalResultValue") %>%
-  filter(PWSID == snakemake@config[["my_pswid"]]) %>%
+  rename(
+    sample_point_ID = SamplePointID,
+    sample_point_name = SamplePointName,
+    sample_point_type = SamplePointType,
+    facility_ID = FacilityID,
+    facility_name = FacilityName,
+    date = CollectionDate,
+    species = Contaminant,
+    unit = AnalyticalResultsSign,
+    value = AnalyticalResultValue) %>%
+  mutate(date = as.Date(date, format = "%m/%d/%Y")) %>%
   readr::write_tsv(snakemake@output[[1]])
diff --git a/workflow/scripts/standardize_wqa.R b/workflow/scripts/standardize_wqa.R
new file mode 100644
index 0000000..f05d148
--- /dev/null
+++ b/workflow/scripts/standardize_wqa.R
@@ -0,0 +1,107 @@
+library(tidyverse)
+
+
+site_df <- readr::read_tsv(
+  snakemake@input[["station"]],
+  col_types = cols(
+    MonitoringLocationIdentifier = "f",
+    MonitoringLocationTypeName = "f",
+    MonitoringLocationName = "f",
+    MonitoringLocationDescriptionText = "c",
+    LatitudeMeasure = "d",
+    LongitudeMeasure = "d",
+    .default = "-")
+) %>%
+  rename(
+    location = MonitoringLocationIdentifier,
+    location_type = MonitoringLocationTypeName,
+    location_name = MonitoringLocationName,
+    location_desc = MonitoringLocationDescriptionText,
+    lat = LatitudeMeasure,
+    long = LongitudeMeasure,
+  )
+
+# This has a bunch of crap in it that has nothing to do with chemicals in
+# water (which might make amphibians gay). Additionally, there are many
+# different units that need to be standardized (eventually to be put in
+# terms of ug/L)
+result_df <- readr::read_tsv(
+  snakemake@input[["results"]],
+  col_types = cols(
+    ActivityTypeCode = "f",
+    ActivityStartDate = "D",
+    ActivityEndDate = "D",
+    ActivityMediaName = "f",
+    MonitoringLocationIdentifier = "f",
+    CharacteristicName = "f",
+    ResultMeasureValue = "c",
+    "ResultMeasure/MeasureUnitCode" = "c",
+    "DetectionQuantitationLimitMeasure/MeasureValue" = "c",
+    "DetectionQuantitationLimitMeasure/MeasureUnitCode" = "f",
+    .default = "-",
+  )
+) %>%
+  rename(activity = ActivityTypeCode,
+         unit = "ResultMeasure/MeasureUnitCode",
+         media = ActivityMediaName,
+         start = ActivityStartDate,
+         end = ActivityEndDate,
+         location = MonitoringLocationIdentifier,
+         value = ResultMeasureValue,
+         limit = "DetectionQuantitationLimitMeasure/MeasureValue",
+         limit_unit = "DetectionQuantitationLimitMeasure/MeasureUnitCode",
+         species = CharacteristicName) %>%
+  arrange(start) %>%
+  filter(media == "Water") %>%
+  select(-media) %>%
+  # select values that are numbers (some are just descriptive strings) assuming
+  # blanks are actually zeros (to be filtered out later)
+  replace_na(list(value = "0")) %>%
+  filter(str_detect(value, "^-?\\d+\\.?\\d*$")) %>%
+  mutate(value = as.numeric(value)) %>%
+  # select units that are mass concentrations or "counts per something"
+  filter(str_detect(unit, "^.*g/.*(l|L|g)$") |
+         unit %in% c("%", "ppb", "ppm")) %>%
+  # remove a bunch of crap with "%" units
str_detect(species, "SSC|Cloud cover|Sediment|solids|demand")) %>% + filter(! str_detect(species, "Pha?eophytin|Chlorophyll|Alkalinity")) %>% + filter(species != "Sodium, percent total cations" # not sure what this means + & species != "Dissolved oxygen saturation" # whatever + & species != "Water" # ironic... + & species != "Barometric pressure" # not a chemical + & species != "Relative humidity" # not a chemical either + & species != "Extract volume" # '' + & species != "Volume, total" # '' + & species != "Acidity, (H+)" # will change later + & species != "Carbon dioxide" # ditto + & species != "Dissolved oxygen (DO)" # ditto + & species != "Total hardness" # not specific + ) %>% + # these seems like a typos + mutate(species = case_when( + species == "Diazinon0" ~ "Diazinon", + species == "Phosphate-phosphorus***retired***use Total Phosphorus, mixed forms" ~ "Total Phosphorus, mixed forms", + species == "Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite" ~ "Nitrate + Nitrite", + TRUE ~ species + )) %>% + # collapse units to (f/n/u/m)g/L + mutate(unit = str_replace(unit, "/l", "/L"), + unit = str_replace(unit, "kg", "L"), + unit = case_when(unit == "ppb" ~ "ug/L", + unit == "ppm" ~ "mg/L", + TRUE ~ unit), + # percent will just be in mg/L + value = if_else(unit == "%", value * 10, value), + unit = if_else(unit == "%", "mg/L", unit)) %>% + # standardize all values to ug/L + mutate(std_value = case_when(unit == "g/L" ~ value * 1e6, + unit == "mg/L" ~ value * 1e3, + unit == "ng/L" ~ value / 1e3, + unit == "fg/L" ~ value / 1e6, + TRUE ~ value)) %>% + select(-value, -unit) + +result_df %>% + left_join(site_df, by = "location") %>% + select(-location) %>% + readr::write_tsv(snakemake@output[[1]]) diff --git a/workflow/scripts/summarize_ucmr.R b/workflow/scripts/summarize_ucmr.R new file mode 100644 index 0000000..bd34324 --- /dev/null +++ b/workflow/scripts/summarize_ucmr.R @@ -0,0 +1,32 @@ +library(tidyverse) + +df <- readr::read_tsv( + snakemake@input[[1]], + col_types = cols(date = "D", value = "d", .default = "c") +) %>% + filter(PWSID == snakemake@config[["my_pswid"]]) %>% + select(-PWSID, -PWSName) + +# purification plant sample point +df %>% + filter(sample_point_ID == snakemake@config[["my_plant_id"]]) %>% + filter(sample_point_type == "EP") %>% + ggplot(aes(value, fct_rev(species))) + + geom_jitter() + + scale_x_log10() + + labs(x = "level (ug/ml)", + y = "species", + title = "Purification Plant") +ggsave(snakemake@output[["plant"]]) + +# tap sample points +df %>% + filter(sample_point_type %in% c("DS", "MR")) %>% + ggplot(aes(value, fct_rev(species))) + + geom_jitter() + + scale_x_log10() + + labs(x = "level (ug/ml)", + y = "species", + title = "Purification Plant") +ggsave(snakemake@output[["tap"]]) + diff --git a/workflow/scripts/summarize_wqa.R b/workflow/scripts/summarize_wqa.R new file mode 100644 index 0000000..fde2c64 --- /dev/null +++ b/workflow/scripts/summarize_wqa.R @@ -0,0 +1,120 @@ +library(tidyverse) + +split_df <- function(df, flt) { + .in <- df %>% + filter({{ flt }}) + .out <- df %>% + filter(! 
+  list(i = .in,
+       o = .out)
+}
+
+df <- readr::read_tsv(
+  "../../results/wqa/process/all.tsv.gz",
+  col_types = cols(
+    start = "D",
+    species = "f",
+    std_value = "d",
+    lat = "d",
+    long = "d",
+    location_name = "c",
+    .default = "-"
+  )
+) %>%
+  # get rid of the deuterium distinction on some pharma species
+  mutate(species = str_replace(species, "-(d|D)\\d", ""))
+
+not_detected <- df %>%
+  group_by(species) %>%
+  summarize(total = sum(std_value)) %>%
+  filter(total == 0) %>%
+  pull(species)
+
+
+harmless <- c(
+  "Sodium",
+  "Bicarbonate",
+  "Calcium",
+  "Magnesium",
+  "Potassium",
+  "Carbonate",
+  "Oxygen",
+  "Silica"
+)
+
+df_detected <- df %>%
+  filter(! species %in% not_detected) %>%
+  filter(! species %in% harmless)
+
+df %>%
+  filter(lubridate::year(start) > 1990) %>%
+  group_by(species) %>%
+  summarize(fraction = mean(std_value > 0),
+            n = n()) %>%
+  mutate(stderr = sqrt(fraction * (1 - fraction) / n)) %>%
+  filter(n > 3) %>%
+  ggplot(aes(fraction, fct_reorder(species, fraction))) +
+  geom_col() +
+  geom_errorbarh(aes(xmin = fraction - stderr, xmax = fraction + stderr))
+
+metals <- c(
+  "Lithium",
+  "Beryllium",
+  "Boron",
+  "Aluminum",
+  "Vanadium",
+  "Chromium",
+  "Manganese",
+  "Iron",
+  "Cobalt",
+  "Nickel",
+  "Copper",
+  "Zinc",
+  "Arsenic",
+  "Selenium",
+  "Strontium",
+  "Molybdenum",
+  "Silver",
+  "Cadmium",
+  "Antimony",
+  "Barium",
+  "Mercury",
+  "Thallium",
+  "Lead",
+  "Uranium"
+)
+
+halides <- c("Chloride", "Fluoride", "Bromide")
+
+.nitro <- split_df(df_detected, str_detect(species, "(n|N)itr")
+                   | str_detect(species, "Ammon"))
+
+.phospho <- split_df(.nitro$o, str_detect(species, "(P|p)hosph"))
+
+.metal <- split_df(.phospho$o, species %in% metals)
+
+.halides <- split_df(.metal$o, species %in% halides)
+
+.nitro$i %>%
+  ggplot(aes(start, std_value, color = species, group = species)) +
+  geom_line()
+
+.halides$i %>%
+  ggplot(aes(start, std_value)) +
+  geom_point() +
+  facet_wrap(scales = "free", c("species"))
+
+.metal$i %>%
+  ggplot(aes(start, std_value)) +
+  geom_point() +
+  facet_wrap(scales = "free", c("species"))
+
+.phospho$i %>%
+  ggplot(aes(start, std_value)) +
+  geom_point() +
+  facet_wrap(scales = "free", c("species"))
+
+.halides$o %>%
+  filter(std_value > 1) %>%
+  ggplot(aes(std_value, species)) +
+  geom_jitter()
diff --git a/workflow/scripts/wssc_to_table.py b/workflow/scripts/wssc_to_table.py
new file mode 100644
index 0000000..e7b28c5
--- /dev/null
+++ b/workflow/scripts/wssc_to_table.py
@@ -0,0 +1,89 @@
+import csv
+import sys
+import re
+import subprocess as sp
+from datetime import datetime
+from pathlib import Path
+from typing import NamedTuple
+
+
+class CsvRow(NamedTuple):
+    year: int
+    species: str
+    unit: str
+    average_lower: float
+    average_upper: float
+    min_lower: float
+    min_upper: float
+    max_lower: float
+    max_upper: float
+    limit: float
+
+
+def fmt_float(x: str) -> float:
+    if x == "n/d" or x == "n.d":
+        return 0
+    else:
+        # spaces sometimes show up if there is a superscript
+        return float(x.split(" ")[0])
+
+
+def fmt_lt(x: str) -> tuple[float, float]:
+    if "<" == x[0]:
+        return (0, fmt_float(x.removeprefix("<")))
+    else:
+        return (y := fmt_float(x), y)
+
+
+def parse_chemical(year, line: list[str]) -> CsvRow:
+    try:
+        limit = float(re.match(r"\d+(\.\d+)?", line[5])[0])
+    except (TypeError, ValueError, IndexError):
+        limit = -1
+    a = fmt_lt(line[2])
+    mx = fmt_lt(line[3])
+    mi = fmt_lt(line[4])
+    return CsvRow(
+        year=year,
+        species=line[0],
+        unit=line[1].replace("µ", "u"),
+        average_lower=a[0],
+        average_upper=a[1],
+        min_lower=mi[0],
+        min_upper=mi[1],
+        max_lower=mx[0],
+        max_upper=mx[1],
+        limit=limit,
+    )
+
+
+def parse_pdf(year: int, ipath: Path) -> list[CsvRow]:
+    res = sp.run(
+        ["pdftotext", "-f", "1", "-l", "4", "-r", "1000", "-layout", ipath, "-"],
+        capture_output=True,
+    )
+    if res.returncode == 0:
+        lns = [
+            l.strip()
+            for l in res.stdout.decode().splitlines()
+            if "/L" in l and " of " not in l and " sample" not in l
+        ]
+        chemicals = [
+            s
+            for x in lns
+            if len(s := re.split(r"\s\s+", x)) > 2 and "Total Organic Carbon" not in s[0]
+        ]
+        return [parse_chemical(year, c) for c in chemicals]
+    else:
+        assert False, res.stderr
+
+
+def main(year: int, ipath: str, opath: str) -> None:
+    rows = parse_pdf(year, ipath)
+    with open(opath, "w") as f:
+        w = csv.writer(f, delimiter="\t")
+        for r in rows:
+            w.writerow(r)
+
+
+main(int(snakemake.wildcards["year"]), snakemake.input[0], snakemake.output[0])