moco-water/workflow/scripts/standardize_wqa.R

108 lines
4.3 KiB
R
Raw Permalink Normal View History

2023-04-05 21:00:46 -04:00
library(tidyverse)
# Station (monitoring location) metadata. Only the six columns listed in the
# column spec are read; every other column in the file is skipped via
# `.default = "-"`. The input path comes from the `snakemake` object's
# "station" input.
site_df <- readr::read_tsv(
  snakemake@input[["station"]],
  col_types = cols(
    .default = "-",
    MonitoringLocationIdentifier = "f",
    MonitoringLocationTypeName = "f",
    MonitoringLocationName = "f",
    MonitoringLocationDescriptionText = "c",
    LatitudeMeasure = "d",
    LongitudeMeasure = "d"
  )
)

# Shorten the column names; `location` is the key used later to join the
# stations onto the result table.
site_df <- rename(
  site_df,
  location = MonitoringLocationIdentifier,
  location_type = MonitoringLocationTypeName,
  location_name = MonitoringLocationName,
  location_desc = MonitoringLocationDescriptionText,
  lat = LatitudeMeasure,
  long = LongitudeMeasure
)
# Result (measurement) table.
#
# The raw file contains many records that have nothing to do with chemical
# concentrations in water, and it mixes many different units. This pipeline:
#   1. reads only the columns needed downstream and gives them short names,
#   2. keeps water samples whose value is a plain decimal number,
#   3. keeps units that look like a mass concentration, plus %, ppb and ppm,
#   4. drops characteristics that are not a specific chemical concentration,
#   5. collapses the units to (f/p/n/u/m)g/L and converts every value to ug/L
#      in `std_value`.
# NOTE: `limit` and `limit_unit` are carried through as read; they are NOT
# converted to ug/L here.
result_df <- readr::read_tsv(
  snakemake@input[["results"]],
  col_types = cols(
    ActivityTypeCode = "f",
    ActivityStartDate = "D",
    ActivityEndDate = "D",
    ActivityMediaName = "f",
    MonitoringLocationIdentifier = "f",
    CharacteristicName = "f",
    ResultMeasureValue = "c",
    "ResultMeasure/MeasureUnitCode" = "c",
    "DetectionQuantitationLimitMeasure/MeasureValue" = "c",
    "DetectionQuantitationLimitMeasure/MeasureUnitCode" = "f",
    .default = "-"
  )
) %>%
  rename(
    activity = ActivityTypeCode,
    unit = "ResultMeasure/MeasureUnitCode",
    media = ActivityMediaName,
    start = ActivityStartDate,
    end = ActivityEndDate,
    location = MonitoringLocationIdentifier,
    value = ResultMeasureValue,
    limit = "DetectionQuantitationLimitMeasure/MeasureValue",
    limit_unit = "DetectionQuantitationLimitMeasure/MeasureUnitCode",
    species = CharacteristicName
  ) %>%
  arrange(start) %>%
  filter(media == "Water") %>%
  select(-media) %>%
  # Keep values that are plain decimal numbers (some are descriptive strings).
  # Blanks are assumed to be zeros. Values written in scientific notation or
  # with a leading "." do not match this pattern and are dropped as well.
  replace_na(list(value = "0")) %>%
  filter(str_detect(value, "^-?\\d+\\.?\\d*$")) %>%
  mutate(value = as.numeric(value)) %>%
  # Keep units that are mass concentrations or "counts per something". Rows
  # with a missing unit are dropped here too (the condition evaluates to NA).
  filter(
    str_detect(unit, "^.*g/.*(l|L|g)$") | unit %in% c("%", "ppb", "ppm")
  ) %>%
  # Remove characteristics (mostly reported in "%") that are not chemical
  # concentrations. Rows with a missing species are dropped by this filter.
  filter(
    !str_detect(species, "SSC|Cloud cover|Sediment|solids|demand"),
    !str_detect(species, "Pha?eophytin|Chlorophyll|Alkalinity")
  ) %>%
  filter(!species %in% c(
    "Sodium, percent total cations", # meaning unclear
    "Dissolved oxygen saturation",   # a saturation, not a concentration
    "Water",                         # the medium itself
    "Barometric pressure",           # not a chemical
    "Relative humidity",             # not a chemical
    "Extract volume",                # not a chemical
    "Volume, total",                 # not a chemical
    "Acidity, (H+)",                 # excluded for now; to be revisited
    "Carbon dioxide",                # excluded for now; to be revisited
    "Dissolved oxygen (DO)",         # excluded for now; to be revisited
    "Total hardness"                 # not specific to one chemical
  )) %>%
  # Fix names that look like typos or that are marked as retired in favour of
  # another name. `species` is read as a factor, so the fall-through is
  # converted explicitly: mixing factor and character branches is an error in
  # dplyr < 1.1, and yields a character column in dplyr >= 1.1 anyway.
  mutate(species = case_when(
    species == "Diazinon0" ~ "Diazinon",
    species == "Phosphate-phosphorus***retired***use Total Phosphorus, mixed forms" ~ "Total Phosphorus, mixed forms",
    species == "Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite" ~ "Nitrate + Nitrite",
    TRUE ~ as.character(species)
  )) %>%
  # Collapse units to (f/p/n/u/m)g/L.
  mutate(
    unit = str_replace(unit, "/l", "/L"),
    # per-kilogram is treated as per-litre (assumes a density of ~1 kg/L)
    unit = str_replace(unit, "kg", "L"),
    unit = case_when(
      unit == "ppb" ~ "ug/L",
      unit == "ppm" ~ "mg/L",
      TRUE ~ unit
    ),
    # 1 % = 1 g per 100 g ~ 10 g/L (same density assumption), so a percent
    # value times 10 is in g/L. It was previously labelled mg/L, which made
    # every percent-based result 1000x too small after standardization.
    value = if_else(unit == "%", value * 10, value),
    unit = if_else(unit == "%", "g/L", unit)
  ) %>%
  # Standardize all values to ug/L using the SI prefixes
  # (m = 1e-3, u = 1e-6, n = 1e-9, p = 1e-12, f = 1e-15).
  mutate(std_value = case_when(
    unit == "g/L" ~ value * 1e6,
    unit == "mg/L" ~ value * 1e3,
    unit == "ug/L" ~ value,
    unit == "ng/L" ~ value / 1e3,
    unit == "pg/L" ~ value / 1e6,
    unit == "fg/L" ~ value / 1e9,
    # NOTE(review): any other unit that passed the filter above (for example
    # a per-gram or per-mL denominator) is passed through unconverted, i.e.
    # it is implicitly treated as ug/L. Confirm which units occur in the data.
    TRUE ~ value
  )) %>%
  select(-value, -unit)
# Attach the station metadata to every result row (a left join keeps all
# result rows; station columns are NA where no station matches), drop the
# join key, and write the table as TSV to the first Snakemake output.
readr::write_tsv(
  select(
    left_join(result_df, site_df, by = "location"),
    -location
  ),
  snakemake@output[[1]]
)