ADD lots of current wssc analysis

2023-04-05 21:00:46 -04:00 · 2023-04-05 21:00:46 -04:00 · 0578a0ed89
parent 3f68fbbbe6
commit 0578a0ed89
12 changed files with 654 additions and 40 deletions
--- a/config/config.yml
+++ b/config/config.yml
@ -1 +1,2 @@
 my_pswid: MD0150005
+my_plant_id: "0100000"
--- a/workflow/.snakemake/log/2023-04-05T200327.912016.snakemake.log
+++ b/workflow/.snakemake/log/2023-04-05T200327.912016.snakemake.log
@ -0,0 +1,14 @@
+Full Traceback (most recent call last):
+  File "/home/ndwar/.local/share/conda/envs/moco-water/lib/python3.11/site-packages/snakemake/__init__.py", line 643, in snakemake
+    workflow.include(
+  File "/home/ndwar/.local/share/conda/envs/moco-water/lib/python3.11/site-packages/snakemake/workflow.py", line 1238, in include
+    exec(compile(code, snakefile.get_path_or_uri(), "exec"), self.globals)
+  File "/mnt/data/Dvl/env/home/water/workflow/Snakefile", line 7, in <module>
+    configfile: "config/config.yml"
+  File "/home/ndwar/.local/share/conda/envs/moco-water/lib/python3.11/site-packages/snakemake/workflow.py", line 1311, in configfile
+    raise WorkflowError(
+snakemake.exceptions.WorkflowError: Workflow defines configfile config/config.yml but it is not present or accessible (full checked path: /mnt/data/Dvl/env/home/water/workflow/config/config.yml).
+
+WorkflowError in file /mnt/data/Dvl/env/home/water/workflow/Snakefile, line 7:
+Workflow defines configfile config/config.yml but it is not present or accessible (full checked path: /mnt/data/Dvl/env/home/water/workflow/config/config.yml).
+  File "/mnt/data/Dvl/env/home/water/workflow/Snakefile", line 7, in <module>
--- a/workflow/.snakemake/log/2023-04-05T200330.952218.snakemake.log
+++ b/workflow/.snakemake/log/2023-04-05T200330.952218.snakemake.log
@ -0,0 +1,3 @@
+WorkflowError in file /mnt/data/Dvl/env/home/water/workflow/Snakefile, line 7:
+Workflow defines configfile config/config.yml but it is not present or accessible (full checked path: /mnt/data/Dvl/env/home/water/workflow/config/config.yml).
+  File "/mnt/data/Dvl/env/home/water/workflow/Snakefile", line 7, in <module>
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@ -4,26 +4,27 @@ from snakemake.utils import min_version

 min_version("7.20")

+
 configfile: "config/config.yml"


 ucmr_data = {
-    1: (
-        "https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip",
-        "ucmr1_list1and2chem_final.xls",
-    ),
-    2: (
-        "https://www.epa.gov/sites/default/files/2015-09/ucmr2_occurrencedata_jan12.zip",
-        "UCMR2_All_OccurrenceData_Jan12.txt",
-    ),
-    3: (
-        "https://www.epa.gov/sites/default/files/2017-02/ucmr-3-occurrence-data.zip",
-        "UCMR3_All.txt",
-    ),
-    4: (
-        "https://www.epa.gov/sites/default/files/2020-04/ucmr_4_occurrence_data.zip",
-        "UCMR4_All.txt",
-    ),
+    2: "https://www.epa.gov/sites/default/files/2015-09/ucmr2_occurrencedata_jan12.zip",
+    3: "https://www.epa.gov/sites/default/files/2017-02/ucmr-3-occurrence-data.zip",
+    4: "https://www.epa.gov/sites/default/files/2020-04/ucmr_4_occurrence_data.zip",
+}
+
+water_reports = {
+    2022: "https://www.wsscwater.com/sites/default/files/2023-03/2022%20POT%20%26%20PAX%20Tap%20Report.pdf",
+    2021: "https://www.wsscwater.com/sites/default/files/2022-07/2021%20POT%20%26%20PAX%20Tap%20Report.pdf",
+    2020: "https://www.wsscwater.com/sites/default/files/2021-04/2020%20POT%20%26%20PAX%20Tap%20Report.pdf",
+    2019: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2019%20POT%20%26%20PAX%20Tap%20Report.pdf",
+    2018: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2018%20POT%20%26%20PAX%20Tap%20Report.pdf",
+    2017: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2017%20POT%20%26%20PAX%20Tap_Report.pdf",
+    2016: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2016%20PAXPOT%20%20WQR.pdf",
+    2015: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2015%20POT%20%26%20PAX%20WQR%20Final%20050516.pdf",
+    2014: "https://www.wsscwater.com/files/live/sites/wssc/files/tap%20water/2014%20POT%20%20PAX%20WQR%20Draft%20022715%20Corrected%20030915.pdf",
+    2013: "https://www.wsscwater.com/files/live/sites/wssc/files/PDFs/TapAnalysis2013_27546.pdf",
 }


@ -31,16 +32,16 @@ rule download_ucmr:
    output:
        "resources/ucmr/{ucmr}.zip",
    params:
-        url=lambda w: ucmr_data[int(w.ucmr)][0],
+        url=lambda w: ucmr_data[int(w.ucmr)],
    shell:
        "curl -sS -L -o {output} {params.url}"


-rule unzip_ucmr_1:
+rule unzip_ucmr_2:
    input:
-        expand(rules.download_ucmr.output, ucmr=1),
+        expand(rules.download_ucmr.output, ucmr=2),
    output:
-        "results/ucmr/ucmr_unzipped_1/ucmr1_list1and2chem_final.xls",
+        "results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt",
    params:
        zipdest=lambda _, output: dirname(output[0]),
    shell:
@ -51,27 +52,21 @@ rule unzip_ucmr_1:
        """


-use rule unzip_ucmr_1 as unzip_ucmr_2 with:
-    input:
-        expand(rules.download_ucmr.output, ucmr=2),
-    output:
-        "results/ucmr/ucmr_unzipped_2/UCMR2_All_OccurrenceData_Jan12.txt",
-
-
-use rule unzip_ucmr_1 as unzip_ucmr_3 with:
+use rule unzip_ucmr_2 as unzip_ucmr_3 with:
    input:
        expand(rules.download_ucmr.output, ucmr=3),
    output:
        "results/ucmr/ucmr_unzipped_3/UCMR3_All.txt",


-use rule unzip_ucmr_1 as unzip_ucmr_4 with:
+use rule unzip_ucmr_2 as unzip_ucmr_4 with:
    input:
        expand(rules.download_ucmr.output, ucmr=4),
    output:
        "results/ucmr/ucmr_unzipped_4/UCMR4_All.txt",


+# they used a real micro symbol instead of "u", which makes R choke
 rule fix_ucmr4_data_tbl:
    input:
        rules.unzip_ucmr_4.output,
@ -83,15 +78,38 @@ rule fix_ucmr4_data_tbl:
        """


-rule standardize_ucmr_3:
+# manually make these data files
+# 1) download zip from here: https://www.epa.gov/sites/default/files/2015-09/ucmr1_list1and2chem.zip
+# 2) open in localc
+# 3) save each of the 'DPCache' tables to tsv files (there should be three)
+rule standardize_ucmr_1:
    input:
-        rules.unzip_ucmr_3.output,
+        expand("resources/ucmr/ucmr1/ucmr1_list1and2chem_final_{i}.tsv", i=[1, 2, 3]),
    output:
-        "results/ucmr_data/all_std_3.txv.gz",
+        "results/ucmr_data/all_std_1.txv.gz",
    conda:
        "envs/tidyverse.yml"
    script:
-        "scripts/standardize_ucmr_34.R"
+        "scripts/standardize_ucmr_1.R"
+
+
+rule standardize_ucmr_2:
+    input:
+        rules.unzip_ucmr_2.output,
+    output:
+        "results/ucmr_data/all_std_2.txv.gz",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/standardize_ucmr_234.R"
+
+
+use rule standardize_ucmr_2 as standardize_ucmr_3 with:
+    input:
+        rules.fix_ucmr4_data_tbl.output,
+    output:
+        "results/ucmr_data/all_std_4.txv.gz",
+

 use rule standardize_ucmr_3 as standardize_ucmr_4 with:
    input:
@ -100,10 +118,140 @@ use rule standardize_ucmr_3 as standardize_ucmr_4 with:
        "results/ucmr_data/all_std_4.txv.gz",


+rule concat_ucmr:
+    input:
+        rules.standardize_ucmr_1.output,
+        rules.standardize_ucmr_2.output,
+        rules.standardize_ucmr_3.output,
+        rules.standardize_ucmr_4.output,
+    output:
+        "results/ucmr_data/all_std.txv.gz",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/concat_ucmr.R"
+
+
+rule summarize_ucmr:
+    input:
+        rules.concat_ucmr.output,
+    output:
+        tap="results/ucmr_plots/tap.pdf",
+        plant="results/ucmr_plots/plant.pdf",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/summarize_ucmr.R"
+
+
+rule download_wqa_results:
+    output:
+        "resources/wqa/results.zip",
+    shell:
+        """
+        curl -Ss -q -X POST --header 'Content-Type: application/json' \
+        --header 'Accept: application/zip' \
+        -d '{{"countrycode":["US"],
+              "statecode":["US:24"],
+              "countycode":["US:24:031"],
+              "within":"20",
+              "lat":"39.109",
+              "long":"-77.2489",
+              "dataProfile":"resultPhysChem",
+              "providers":["NWIS","STEWARDS","STORET"]
+             }}' \
+        'https://www.waterqualitydata.us/data/Result/search?mimeType=tsv&zip=yes' \
+        > {output}
+        """
+
+
+rule download_wqa_station:
+    output:
+        "resources/wqa/station.zip",
+    shell:
+        """
+        curl -Ss -q -X POST --header 'Content-Type: application/json' \
+        --header 'Accept: application/zip' \
+        -d '{{"countrycode":["US"],
+              "statecode":["US:24"],
+              "countycode":["US:24:031"],
+              "within":"20",
+              "lat":"39.109",
+              "long":"-77.2489",
+              "providers":["NWIS","STEWARDS","STORET"]
+            }}' \
+        'https://www.waterqualitydata.us/data/Station/search?mimeType=tsv&zip=yes' \
+        > {output}
+        """
+
+
+use rule unzip_ucmr_2 as unzip_wqa_results with:
+    input:
+        rules.download_wqa_results.output,
+    output:
+        "results/wqa/src/results/resultphyschem.tsv",
+
+
+use rule unzip_ucmr_2 as unzip_wqa_station with:
+    input:
+        rules.download_wqa_station.output,
+    output:
+        "results/wqa/src/station/station.tsv",
+
+
+rule standardize_wqa:
+    input:
+        station=rules.unzip_wqa_station.output,
+        results=rules.unzip_wqa_results.output,
+    output:
+        "results/wqa/process/all.tsv.gz",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/standardize_wqa.R"
+
+
+rule download_water_report:
+    output:
+        "resources/wssc/{year}.pdf",
+    params:
+        url=lambda w: water_reports[int(w.year)],
+    shell:
+        "curl -sS -L -o {output} {params.url}"
+
+
+rule parse_water_report:
+    input:
+        rules.download_water_report.output,
+    output:
+        "results/wssc/{year}.tsv",
+    script:
+        "scripts/wssc_to_table.py"
+
+
+rule cat_reports:
+    input:
+        expand(rules.parse_water_report.output, year=water_reports),
+    output:
+        "results/wssc/all.tsv",
+    shell:
+        "cat {input} > {output}"
+
+
+rule analyse_reports:
+    input:
+        rules.cat_reports.output,
+    output:
+        limit="results/wssc/binned_limit.tsv",
+        nolimit="results/wssc/detected_nolimit.tsv",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/analyze_wssc.R"
+

 rule all:
    input:
-        rules.standardize_ucmr_3.output,
-        rules.standardize_ucmr_4.output,
-        # expand(rules.unzip_ucmr.output, ucmr=[1, 2, 3]),
-        # rules.fix_ucmr4_data_tbl.output,
+        rules.summarize_ucmr.output,
+        rules.standardize_wqa.output,
+        rules.analyse_reports.output,
--- a/workflow/scripts/analyze_wssc.R
+++ b/workflow/scripts/analyze_wssc.R
@ -0,0 +1,54 @@
+library(tidyverse)
+
+split_df <- function(df, test) {
+  list(i = filter(df, {{ test }}), o = filter(df, !{{ test }}))
+}
+
+path <- snakemake@input[[1]]
+
+df <- readr::read_tsv(
+  path,
+  col_types = "iccddddddd",
+  col_names =
+    c("year",
+      "species",
+      "unit",
+      "ave_lower",
+      "ave_upper",
+      "min_lower",
+      "min_upper",
+      "max_lower",
+      "max_upper",
+      "limit"
+      )
+) %>%
+  # there are some TTHM/HHA5 entries in here twice, use the ones with limits
+  filter(!str_detect(species, "(TTHM|HAA5)") | limit > 0)
+
+has_limit <- df %>%
+  group_by(species) %>%
+  summarize(limit = max(limit)) %>%
+  filter(limit > 0) %>%
+  pull(species)
+
+limited <- split_df(df, species %in% has_limit)
+
+binned_limit <- limited$i %>%
+  group_by(species) %>%
+  summarize(av = max(ave_upper), mx = max(max_upper), limit = max(limit), .groups = "drop") %>%
+  mutate(bin = case_when(mx == 0 ~ "undetected",
+                            mx > limit ~ "over",
+                            mx > limit / 10 ~ "over10",
+                            mx > limit / 100 ~ "over100",
+                            TRUE ~ "safeIGuess")) %>%
+  filter(bin != "undetected") %>%
+  arrange(bin, species) %>%
+  readr::write_tsv(snakemake@output[["limit"]])
+
+detected_nolimit <- limited$o %>%
+  group_by(species) %>%
+  summarize(av = max(ave_upper), mx = max(max_upper)) %>%
+  mutate(detected = mx > 0) %>%
+  filter(detected) %>%
+  arrange(species) %>%
+  readr::write_tsv(snakemake@output[["nolimit"]])
--- a/workflow/scripts/concat_ucmr.R
+++ b/workflow/scripts/concat_ucmr.R
@ -0,0 +1,6 @@
+library(tidyverse)
+
+snakemake@input %>%
+  map_dfr(~ readr::read_tsv(.x, col_types = "c")) %>%
+  replace_na(list(value = 0)) %>%
+  readr::write_tsv(snakemake@output[[1]])
--- a/workflow/scripts/standardize_ucmr_1.R
+++ b/workflow/scripts/standardize_ucmr_1.R
@ -0,0 +1,31 @@
+library(tidyverse)
+
+read_tsv <- function(path) {
+  readr::read_tsv(
+    path,
+    col_types = cols(
+      PWSID = "c",
+      PWSName = "c",
+      "Facility ID" = "c",
+      "Sample point ID" = "c",
+      "Sample point type" = "c",
+      "Sample collection date" = "c",
+      Contaminant = "c",
+      "Unit measure" = "c",
+      Result = "d",
+      .default = "-"
+    ))
+}
+
+df <- snakemake@input %>%
+  map_dfr(read_tsv) %>%
+  rename(sample_point_ID = "Sample point ID",
+         sample_point_type = "Sample point type",
+         facility_ID = "Facility ID",
+         date = "Sample collection date",
+         species = Contaminant,
+         unit = "Unit measure",
+         value = Result) %>%
+  mutate(date = as.Date(date, format = "%m/%d/%y")) %>%
+  readr::write_tsv(snakemake@output[[1]])
+         
--- a/workflow/scripts/standardize_ucmr_234.R
+++ b/workflow/scripts/standardize_ucmr_234.R
@ -5,6 +5,7 @@ snakemake@input[[1]] %>%
  col_types = cols(
    PWSID = "c",
    PWSName = "c",
+    FacilityID = "c",
    FacilityName = "c",
    SamplePointID = "c",
    SamplePointName = "c",
@ -15,7 +16,15 @@ snakemake@input[[1]] %>%
    AnalyticalResultValue = "d",
    .default = "-"
  )) %>%
-  mutate(CollectionDate = as.Date(CollectionDate)) %>%
-  rename(value = "AnalyticalResultValue") %>%
-  filter(PWSID == snakemake@config[["my_pswid"]]) %>%
+  rename(
+    sample_point_ID = SamplePointID,
+    sample_point_name = SamplePointName,
+    sample_point_type = SamplePointType,
+    facility_ID = FacilityID,
+    facility_Name = FacilityName,
+    date = CollectionDate,
+    species = Contaminant,
+    unit = AnalyticalResultsSign,
+    value = AnalyticalResultValue) %>%
+  mutate(date = as.Date(date, format = "%m/%d/%Y")) %>%
  readr::write_tsv(snakemake@output[[1]])
--- a/workflow/scripts/standardize_wqa.R
+++ b/workflow/scripts/standardize_wqa.R
@ -0,0 +1,107 @@
+library(tidyverse)
+
+
+site_df <- readr::read_tsv(
+  snakemake@input[["station"]],
+  col_types = cols(
+    MonitoringLocationIdentifier = "f",
+    MonitoringLocationTypeName = "f",
+    MonitoringLocationName = "f",
+    MonitoringLocationDescriptionText = "c",
+    LatitudeMeasure = "d",
+    LongitudeMeasure = "d",
+    .default = "-")
+) %>%
+  rename(
+    location = MonitoringLocationIdentifier,
+    location_type = MonitoringLocationTypeName,
+    location_name = MonitoringLocationName,
+    location_desc = MonitoringLocationDescriptionText,
+    lat = LatitudeMeasure,
+    long = LongitudeMeasure,
+    )
+
+# This has a bunch of crap in it that has nothing to do with chemicals in
+# water (which might make amphibians gay). Additionally, there are many
+# different units that need to be standardized (eventually to be put in
+# terms of ug/ml)
+result_df <- readr::read_tsv(
+  snakemake@input[["results"]],
+  col_types = cols(
+    ActivityTypeCode = "f",
+    ActivityStartDate = "D",
+    ActivityEndDate = "D",
+    ActivityMediaName = "f",
+    MonitoringLocationIdentifier = "f",
+    CharacteristicName = "f",
+    ResultMeasureValue = "c",
+    "ResultMeasure/MeasureUnitCode" = "c",
+    "DetectionQuantitationLimitMeasure/MeasureValue" = "c",
+    "DetectionQuantitationLimitMeasure/MeasureUnitCode" = "f",
+    .default = "-",
+  )
+) %>%
+  rename(activity = ActivityTypeCode,
+         unit = "ResultMeasure/MeasureUnitCode",
+         media = ActivityMediaName,
+         start = ActivityStartDate,
+         end = ActivityEndDate,
+         location = MonitoringLocationIdentifier,
+         value = ResultMeasureValue,
+         limit = "DetectionQuantitationLimitMeasure/MeasureValue",
+         limit_unit = "DetectionQuantitationLimitMeasure/MeasureUnitCode",
+         species = CharacteristicName) %>%
+  arrange(start) %>%
+  filter(media == "Water") %>%
+  select(-media) %>%
+  # select values that are numbers (some are just descriptive strings) assuming
+  # blanks are actually zeros (to be filtered out later)
+  replace_na(list(value = "0")) %>%
+  filter(str_detect(value, "^-?\\d+\\.?\\d*$")) %>%
+  mutate(value = as.numeric(value)) %>%
+  # select units that are mass concentrations or "counts per something"
+  filter(str_detect(unit, "^.*g/.*(l|L|g)$") |
+           unit %in% c("%", "ppb", "ppm")) %>%
+  # remove a bunch of crap with "%" units
+  filter(! str_detect(species, "SSC|Cloud cover|Sediment|solids|demand")) %>%
+  filter(! str_detect(species, "Pha?eophytin|Chlorophyll|Alkalinity")) %>%
+  filter(species != "Sodium, percent total cations" # not sure what this means
+         & species != "Dissolved oxygen saturation" # whatever
+         & species != "Water" # ironic...
+         & species != "Barometric pressure" # not a chemical
+         & species != "Relative humidity" # not a chemical either
+         & species != "Extract volume" # ''
+         & species != "Volume, total" # ''
+         & species != "Acidity, (H+)" # will change later
+         & species != "Carbon dioxide" # ditto
+         & species != "Dissolved oxygen (DO)" # ditto
+         & species != "Total hardness" # not specific
+         ) %>%
+  # these seems like a typos
+  mutate(species = case_when(
+    species == "Diazinon0" ~ "Diazinon",
+    species == "Phosphate-phosphorus***retired***use Total Phosphorus, mixed forms" ~ "Total Phosphorus, mixed forms",
+    species == "Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite" ~ "Nitrate + Nitrite",
+    TRUE ~ species
+  )) %>%
+  # collapse units to (f/n/u/m)g/L
+  mutate(unit = str_replace(unit, "/l", "/L"),
+         unit = str_replace(unit, "kg", "L"),
+         unit = case_when(unit == "ppb" ~ "ug/L",
+                          unit == "ppm" ~ "mg/L",
+                          TRUE ~ unit),
+         # percent will just be in mg/L
+         value = if_else(unit == "%", value * 10, value),
+         unit = if_else(unit == "%", "mg/L", unit)) %>%
+  # standardize all values to ug/L
+  mutate(std_value = case_when(unit == "g/L" ~ value * 1e6,
+                               unit == "mg/L" ~ value * 1e3,
+                               unit == "ng/L" ~ value / 1e3,
+                               unit == "fg/L" ~ value / 1e6,
+                               TRUE ~ value)) %>%
+  select(-value, -unit)
+
+result_df %>%
+  left_join(site_df, by = "location") %>%
+  select(-location) %>%
+  readr::write_tsv(snakemake@output[[1]])
--- a/workflow/scripts/summarize_ucmr.R
+++ b/workflow/scripts/summarize_ucmr.R
@ -0,0 +1,32 @@
+library(tidyverse)
+
+df <- readr::read_tsv(
+  snakemake@input[[1]],
+  col_types = cols(date = "D", value = "d", .default = "c")
+) %>%
+  filter(PWSID == snakemake@config[["my_pswid"]]) %>%
+  select(-PWSID, -PWSName)
+
+# purification plant sample point
+df %>%
+  filter(sample_point_ID == snakemake@config[["my_plant_id"]]) %>%
+  filter(sample_point_type == "EP") %>%
+  ggplot(aes(value, fct_rev(species))) +
+  geom_jitter() +
+  scale_x_log10() +
+  labs(x = "level (ug/ml)",
+       y = "species",
+       title = "Purification Plant")
+ggsave(snakemake@output[["plant"]])
+
+# tap sample points
+df %>%
+  filter(sample_point_type %in% c("DS", "MR")) %>%
+  ggplot(aes(value, fct_rev(species))) +
+  geom_jitter() +
+  scale_x_log10() +
+  labs(x = "level (ug/ml)",
+       y = "species",
+       title = "Purification Plant")
+ggsave(snakemake@output[["tap"]])
+  
--- a/workflow/scripts/summarize_wqa.R
+++ b/workflow/scripts/summarize_wqa.R
@ -0,0 +1,120 @@
+library(tidyverse)
+
+split_df <- function(df, flt) {
+  .in <- df %>%
+    filter({{ flt }})
+  .out <- df %>%
+    filter(! {{ flt }})
+  list(i = .in,
+       o = .out)
+}
+
+df <- readr::read_tsv(
+  "../../results/wqa/process/all.tsv.gz",
+  col_types = cols(
+    start = "D",
+    species = "f",
+    std_value = "d",
+    lat = "d",
+    long = "d",
+    location_name = "c",
+    .default = "-"
+  )
+) %>%
+  # get rid of the deuterium distinction on some pharma species
+  mutate(species = str_replace(species, "-(d|D)\\d", ""))
+
+not_detected <- df %>%
+  group_by(species) %>%
+  summarize(total = sum(std_value)) %>%
+  filter(total == 0) %>%
+  pull(species)
+
+
+harmless <- c(
+  "Sodium",
+  "Bicarbonate",
+  "Calcium",
+  "Magnesium",
+  "Potassium",
+  "Carbonate",
+  "Oxygen",
+  "Silica"
+)
+
+df_detected <- df %>%
+  filter(! species %in% not_detected) %>%
+  filter(! species %in% harmless)
+
+df %>%
+  filter(lubridate::year(start) > 1990) %>%
+  group_by(species) %>%
+  summarize(fraction = mean(std_value > 0),
+            n = n()) %>%
+  mutate(stderr = sqrt(fraction * (1 - fraction) / n)) %>%
+  filter(n > 3) %>%
+  ggplot(aes(fraction, fct_reorder(species, fraction))) +
+  geom_col() +
+  geom_errorbarh(aes(xmin = fraction - stderr, xmax = fraction + stderr))
+
+metals <- c(
+  "Lithium",
+  "Beryllium",
+  "Boron",
+  "Aluminum",
+  "Vanadium",
+  "Chromium",
+  "Manganese",
+  "Iron",
+  "Cobalt",
+  "Nickel",
+  "Copper",
+  "Zinc",
+  "Arsenic",
+  "Selenium",
+  "Strontium",
+  "Molybdenum",
+  "Silver",
+  "Cadmium",
+  "Antimony",
+  "Barium",
+  "Mercury",
+  "Thallium",
+  "Lead",
+  "Uranium"
+)
+
+halides <- c("Chloride", "Fluoride", "Bromide")
+
+.nitro <- split_df(df_detected, str_detect(species, "(n|N)itr")
+                   | str_detect(species, "Ammon"))
+
+.phospho <- split_df(.nitro$o, str_detect(species, "(P|p)hosph"))
+
+.metal <- split_df(.phospho$o, species %in% metals)
+
+.halides <- split_df(.metal$o, species %in% halides)
+
+.nitro$i %>%
+  ggplot(aes(start, std_value, color = species, group = species)) +
+  geom_line()
+
+.halides$i %>%
+  ggplot(aes(start, std_value)) +
+  geom_point() +
+  facet_wrap(scales = "free", c("species"))
+
+.metal$i %>%
+  ggplot(aes(start, std_value)) +
+  geom_point() +
+  facet_wrap(scales = "free", c("species"))
+
+.phospho$i %>%
+  ggplot(aes(start, std_value)) +
+  geom_point() +
+  facet_wrap(scales = "free", c("species"))
+
+.halides$o %>%
+  filter(std_value > 1) %>%
+  ggplot(aes(std_value, species)) +
+  geom_jitter()
--- a/workflow/scripts/wssc_to_table.py
+++ b/workflow/scripts/wssc_to_table.py
@ -0,0 +1,89 @@
+import csv
+import sys
+import re
+import subprocess as sp
+from datetime import datetime
+from pathlib import Path
+from typing import NamedTuple
+
+
+class CsvRow(NamedTuple):
+    year: int
+    species: str
+    unit: str
+    average_lower: float
+    average_upper: float
+    min_lower: float
+    min_upper: float
+    max_lower: float
+    max_upper: float
+    limit: float
+
+
+def fmt_float(x: str) -> float:
+    if x == "n/d" or x == "n.d":
+        return 0
+    else:
+        # spaces sometimes show up if there is a superscript
+        return float(x.split(" ")[0])
+
+
+def fmt_lt(x: str) -> tuple[float, float]:
+    if "<" == x[0]:
+        return (0, fmt_float(x.removeprefix("<")))
+    else:
+        return (y := fmt_float(x), y)
+
+
+def parse_chemical(year, line: list[str]) -> CsvRow:
+    try:
+        limit = float(re.match("\d+(\.\d+)?", line[5])[0])
+    except (TypeError, ValueError, IndexError):
+        limit = -1
+    a = fmt_lt(line[2])
+    mx = fmt_lt(line[3])
+    mi = fmt_lt(line[4])
+    return CsvRow(
+        year=year,
+        species=line[0],
+        unit=line[1].replace("µ", "u"),
+        average_lower=a[0],
+        average_upper=a[1],
+        min_lower=mi[0],
+        min_upper=mi[1],
+        max_lower=mx[0],
+        max_upper=mx[1],
+        limit=limit,
+    )
+
+
+def parse_pdf(year: int, ipath: Path) -> list[CsvRow]:
+    res = sp.run(
+        ["pdftotext", "-f", "1", "-l", "4", "-r", "1000", "-layout", ipath, "-"],
+        capture_output=True,
+    )
+    if res.returncode == 0:
+        lns = [
+            l.strip()
+            for l in res.stdout.decode().splitlines()
+            if "/L" in l and not " of " in l and not " sample" in l
+        ]
+        chemicals = [
+            s
+            for x in lns
+            if len(s := re.split("\s\s+", x)) > 2 and "Total Organic Carbon" not in s[0]
+        ]
+        return [parse_chemical(year, c) for c in chemicals]
+    else:
+        assert False, res.stderr
+
+
+def main(year: int, ipath: str, opath: str) -> None:
+    rows = parse_pdf(year, ipath)
+    with open(opath, "w") as f:
+        w = csv.writer(f, delimiter="\t")
+        for r in rows:
+            w.writerow(r)
+
+
+main(snakemake.wildcards["year"], snakemake.input[0], snakemake.output[0])