# moco-water/workflow/scripts/wssc_to_table.py
#
# 90 lines
# 2.2 KiB
# Python

import csv
import sys
import re
import subprocess as sp
from datetime import datetime
from pathlib import Path
from typing import NamedTuple
class CsvRow(NamedTuple):
    """One output record: the figures for a single species in one year.

    The average, minimum and maximum are each stored as a pair of
    ``*_lower`` / ``*_upper`` fields.  Because this is a tuple, the field
    order below is also the column order when a row is written out.
    """

    # What was measured, when, and in which unit.
    year: int
    species: str
    unit: str

    # Average value, as a lower/upper pair.
    average_lower: float
    average_upper: float

    # Minimum value, as a lower/upper pair.
    min_lower: float
    min_upper: float

    # Maximum value, as a lower/upper pair.
    max_lower: float
    max_upper: float

    # Limit value associated with the species.
    limit: float
def fmt_float(x: str) -> float:
    """Convert the text of one numeric table cell to a number.

    Surrounding whitespace is ignored.  The markers ``"n/d"`` and ``"n.d"``
    are mapped to ``0``.  For anything else, only the text before the first
    space is parsed, so trailing material such as a superscript marker is
    dropped.

    Args:
        x: raw cell text.

    Returns:
        The parsed value, or ``0`` for the ``"n/d"`` / ``"n.d"`` markers.

    Raises:
        ValueError: if the leading token is not a valid float literal
            (this includes an empty or whitespace-only cell).
    """
    # Strip first: a cell such as " 0.5" (what is left of "< 0.5" once the
    # "<" is removed) would otherwise split into ["", "0.5"] and fail on
    # float("").
    cell = x.strip()
    if cell in ("n/d", "n.d"):
        return 0
    # Spaces sometimes show up if there is a superscript; keep the first
    # token only.
    return float(cell.split(" ")[0])
def fmt_lt(x: str) -> tuple[float, float]:
    """Turn a cell that may carry a "<" prefix into a (lower, upper) pair.

    A cell whose first character is ``"<"`` yields ``(0, value)``, where
    ``value`` is the number following the sign.  Any other cell yields the
    parsed number for both elements of the pair.
    """
    if x[0] != "<":
        value = fmt_float(x)
        return (value, value)
    return (0, fmt_float(x.removeprefix("<")))
def parse_chemical(year, line: list[str]) -> CsvRow:
    """Build a ``CsvRow`` from the cells of one table line.

    Args:
        year: value stored unchanged in the row's ``year`` field.
        line: cells of one line, in the order species, unit, average,
            maximum, minimum and, optionally, limit.

    Returns:
        The populated row.  ``limit`` is ``-1`` when the sixth cell is
        missing or does not start with a number.
    """
    try:
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        limit = float(re.match(r"\d+(\.\d+)?", line[5])[0])
    except (TypeError, ValueError, IndexError):
        # IndexError: there is no sixth cell.  TypeError: the pattern did
        # not match, so re.match returned None, which is not subscriptable.
        limit = -1

    average = fmt_lt(line[2])
    maximum = fmt_lt(line[3])
    minimum = fmt_lt(line[4])

    # Both the micro sign (U+00B5) and the Greek small letter mu (U+03BC)
    # are used to write "micro"; normalise either to an ASCII "u".
    unit = line[1].replace("\u00b5", "u").replace("\u03bc", "u")

    return CsvRow(
        year=year,
        species=line[0],
        unit=unit,
        average_lower=average[0],
        average_upper=average[1],
        min_lower=minimum[0],
        min_upper=minimum[1],
        max_lower=maximum[0],
        max_upper=maximum[1],
        limit=limit,
    )
def parse_pdf(year: int, ipath: Path) -> list[CsvRow]:
    """Extract table rows from the first four pages of a PDF.

    Runs ``pdftotext -f 1 -l 4 -r 1000 -layout`` on *ipath* and reads its
    standard output.  A line is kept when it contains ``"/L"`` and neither
    ``" of "`` nor ``" sample"``.  Each kept line is stripped and split on
    runs of two or more whitespace characters; lines that give more than
    two cells and whose first cell does not contain
    ``"Total Organic Carbon"`` are passed to ``parse_chemical``.

    Args:
        year: forwarded to ``parse_chemical`` for every row.
        ipath: path of the PDF to read.

    Returns:
        One ``CsvRow`` per accepted line, in document order.

    Raises:
        RuntimeError: if ``pdftotext`` exits with a non-zero status; the
            message carries its standard error output.
    """
    res = sp.run(
        ["pdftotext", "-f", "1", "-l", "4", "-r", "1000", "-layout", ipath, "-"],
        capture_output=True,
    )
    if res.returncode != 0:
        # Raise explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would let a failed conversion
        # fall through and return None.
        stderr = res.stderr.decode(errors="replace").strip()
        raise RuntimeError(f"pdftotext failed on {ipath}: {stderr}")

    candidate_lines = [
        text_line.strip()
        for text_line in res.stdout.decode().splitlines()
        if "/L" in text_line
        and " of " not in text_line
        and " sample" not in text_line
    ]
    # Columns in the -layout output are separated by at least two
    # whitespace characters; single spaces stay inside a cell.
    cell_rows = (re.split(r"\s\s+", text_line) for text_line in candidate_lines)
    return [
        parse_chemical(year, cells)
        for cells in cell_rows
        if len(cells) > 2 and "Total Organic Carbon" not in cells[0]
    ]
def main(year: int, ipath: str, opath: str) -> None:
    """Parse the PDF at *ipath* and write its rows to *opath*.

    The output is tab-delimited, one line per row, with no header line.

    Args:
        year: forwarded to ``parse_pdf``.
        ipath: path of the PDF to read.
        opath: path of the tab-delimited file to create or overwrite.
    """
    rows = parse_pdf(year, ipath)
    # newline="" lets the csv writer control line endings itself; without
    # it the writer's "\r\n" is translated a second time on platforms that
    # map "\n" to "\r\n".  The encoding is pinned so the output does not
    # depend on the locale.
    with open(opath, "w", newline="", encoding="utf-8") as f:
        csv.writer(f, delimiter="\t").writerows(rows)
# NOTE(review): `snakemake` is neither defined nor imported in this file;
# presumably it is provided by the workflow runner that executes this
# script -- confirm.
main(
    snakemake.wildcards["year"],
    snakemake.input[0],
    snakemake.output[0],
)