library(tidyverse)

# Build one table of chemical concentrations measured in water:
#   1. read the monitoring-station metadata,
#   2. read the measurement results, drop rows that are not numeric chemical
#      concentrations, and convert every value to ug/L,
#   3. attach the station metadata to each result and write the table out.
# Input and output paths come from the `snakemake` object.

# Station metadata ----

# Only the columns named here are parsed; `.default = "-"` skips all others.
site_df <- readr::read_tsv(
  snakemake@input[["station"]],
  col_types = cols(
    MonitoringLocationIdentifier = "f",
    MonitoringLocationTypeName = "f",
    MonitoringLocationName = "f",
    MonitoringLocationDescriptionText = "c",
    LatitudeMeasure = "d",
    LongitudeMeasure = "d",
    .default = "-"
  )
) %>%
  rename(
    location = MonitoringLocationIdentifier,
    location_type = MonitoringLocationTypeName,
    location_name = MonitoringLocationName,
    location_desc = MonitoringLocationDescriptionText,
    lat = LatitudeMeasure,
    long = LongitudeMeasure
  )

# Measurement results ----

# Species that pass the unit filter below but are not usable chemical
# concentrations.
excluded_species <- c(
  "Sodium, percent total cations", # meaning unclear
  "Dissolved oxygen saturation",
  "Water",
  "Barometric pressure",           # not a chemical
  "Relative humidity",             # not a chemical
  "Extract volume",                # not a chemical
  "Volume, total",                 # not a chemical
  "Acidity, (H+)",                 # to be revisited later
  "Carbon dioxide",                # to be revisited later
  "Dissolved oxygen (DO)",         # to be revisited later
  "Total hardness"                 # not specific
)

# The raw results contain many rows unrelated to chemicals in water, and the
# values come in many different units; everything kept is standardized to
# ug/L.
result_df <- readr::read_tsv(
  snakemake@input[["results"]],
  col_types = cols(
    ActivityTypeCode = "f",
    ActivityStartDate = "D",
    ActivityEndDate = "D",
    ActivityMediaName = "f",
    MonitoringLocationIdentifier = "f",
    # Character rather than factor: the species names are rewritten with
    # string results below, and mixing factor and character inside
    # case_when() fails on older dplyr versions.
    CharacteristicName = "c",
    # Read as text because some entries are descriptive strings, not numbers.
    ResultMeasureValue = "c",
    "ResultMeasure/MeasureUnitCode" = "c",
    "DetectionQuantitationLimitMeasure/MeasureValue" = "c",
    "DetectionQuantitationLimitMeasure/MeasureUnitCode" = "f",
    .default = "-"
  )
) %>%
  rename(
    activity = ActivityTypeCode,
    unit = "ResultMeasure/MeasureUnitCode",
    media = ActivityMediaName,
    start = ActivityStartDate,
    end = ActivityEndDate,
    location = MonitoringLocationIdentifier,
    value = ResultMeasureValue,
    limit = "DetectionQuantitationLimitMeasure/MeasureValue",
    limit_unit = "DetectionQuantitationLimitMeasure/MeasureUnitCode",
    species = CharacteristicName
  ) %>%
  arrange(start) %>%
  filter(media == "Water") %>%
  select(-media) %>%
  # Keep only values that are plain numbers (some are descriptive strings).
  # Blank values are assumed to be zeros.
  replace_na(list(value = "0")) %>%
  filter(str_detect(value, "^-?\\d+\\.?\\d*$")) %>%
  mutate(value = as.numeric(value)) %>%
  # Keep units that are mass concentrations (".../L", ".../l", ".../g") or
  # one of the dimensionless ratios handled below. Rows with a missing unit
  # are dropped here as well.
  filter(
    str_detect(unit, "^.*g/.*(l|L|g)$") | unit %in% c("%", "ppb", "ppm")
  ) %>%
  # Remove non-chemical measurements, most of which carry "%" units.
  filter(!str_detect(species, "SSC|Cloud cover|Sediment|solids|demand")) %>%
  filter(!str_detect(species, "Pha?eophytin|Chlorophyll|Alkalinity")) %>%
  filter(!species %in% excluded_species) %>%
  # Fix what look like typos and map retired names to their replacements.
  mutate(species = case_when(
    species == "Diazinon0" ~ "Diazinon",
    species == "Phosphate-phosphorus***retired***use Total Phosphorus, mixed forms" ~ "Total Phosphorus, mixed forms",
    species == "Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite" ~ "Nitrate + Nitrite",
    TRUE ~ species
  )) %>%
  # Collapse units to (f/p/n/u/m)g/L. Per-kg units are treated as per-litre.
  mutate(
    unit = str_replace(unit, "/l", "/L"),
    unit = str_replace(unit, "kg", "L"),
    unit = case_when(
      unit == "ppb" ~ "ug/L",
      unit == "ppm" ~ "mg/L",
      TRUE ~ unit
    ),
    # Percent is expressed in mg/L: 1 % = 10 g/L = 1e4 mg/L. The value must
    # be rescaled before the unit label is overwritten on the next line.
    value = if_else(unit == "%", value * 1e4, value),
    unit = if_else(unit == "%", "mg/L", unit)
  ) %>%
  # Standardize all values to ug/L.
  mutate(std_value = case_when(
    unit == "g/L" ~ value * 1e6,
    unit == "mg/L" ~ value * 1e3,
    unit == "ng/L" ~ value / 1e3,
    unit == "pg/L" ~ value / 1e6,
    unit == "fg/L" ~ value / 1e9,
    # NOTE(review): any other unit that passed the filter (e.g. "mg/g") is
    # passed through as if it were already ug/L -- confirm this is intended.
    TRUE ~ value
  )) %>%
  select(-value, -unit)

# Output ----

# `location` is only the join key, so it is dropped once the station
# metadata has been attached.
result_df %>%
  left_join(site_df, by = "location") %>%
  select(-location) %>%
  readr::write_tsv(snakemake@output[[1]])