"""Extract water-quality chemical rows from a PDF report into a TSV file.

The PDF is converted to text with the external ``pdftotext`` tool; lines that
look like table rows are parsed into :class:`CsvRow` records and written out
tab-separated.
"""

import csv
import sys
import re
import subprocess as sp
from datetime import datetime
from pathlib import Path
from typing import NamedTuple

# Leading number (integer or decimal) at the start of the "limit" column.
_LIMIT_RE = re.compile(r"\d+(\.\d+)?")

# Two or more whitespace characters separate columns in `-layout` output.
_COLUMN_SEP_RE = re.compile(r"\s\s+")

# Spellings of the micro prefix that are normalised to a plain "u":
# MICRO SIGN, GREEK SMALL LETTER MU, and the garbled two-character sequence
# the original script matched (it looks like UTF-8 "µ" decoded with a Thai
# code page; kept so previously matching input still matches).
_MICRO_SIGNS = ("\u00b5", "\u03bc", "ยต")


class CsvRow(NamedTuple):
    """One chemical measurement row, in output column order."""

    year: int
    species: str
    unit: str
    average_lower: float
    average_upper: float
    min_lower: float
    min_upper: float
    max_lower: float
    max_upper: float
    limit: float


def fmt_float(x: str) -> float:
    """Convert a table cell to a number.

    ``"n/d"`` and ``"n.d"`` map to ``0``. Otherwise only the text before the
    first space is parsed, because a superscript can show up as a trailing
    space-separated token.

    Raises:
        ValueError: if the leading token is not a valid float.
    """
    if x in ("n/d", "n.d"):
        return 0
    # spaces sometimes show up if there is a superscript
    return float(x.split(" ")[0])


def fmt_lt(x: str) -> tuple[float, float]:
    """Convert a cell to a ``(lower, upper)`` pair.

    A cell of the form ``"<v"`` yields ``(0, v)``; any other cell yields the
    same parsed value for both bounds.
    """
    # startswith() is safe on an empty string, unlike x[0].
    if x.startswith("<"):
        return (0, fmt_float(x.removeprefix("<")))
    value = fmt_float(x)
    return (value, value)


def _parse_limit(line: list[str]) -> float:
    """Return the leading number of column 5, or ``-1`` if absent/non-numeric."""
    if len(line) <= 5:
        return -1
    match = _LIMIT_RE.match(line[5])
    return float(match[0]) if match else -1


def _normalize_unit(unit: str) -> str:
    """Replace every known spelling of the micro prefix with ``"u"``."""
    for sign in _MICRO_SIGNS:
        unit = unit.replace(sign, "u")
    return unit


def parse_chemical(year: int, line: list[str]) -> CsvRow:
    """Build a :class:`CsvRow` from one split table line.

    Columns are read positionally: 0 species, 1 unit, 2 average, 3 maximum,
    4 minimum, 5 (optional) limit. A missing or non-numeric limit becomes
    ``-1``.

    Raises:
        IndexError: if ``line`` has fewer than five columns.
        ValueError: if a value column cannot be parsed by :func:`fmt_float`.
    """
    average = fmt_lt(line[2])
    maximum = fmt_lt(line[3])
    minimum = fmt_lt(line[4])
    return CsvRow(
        year=year,
        species=line[0],
        unit=_normalize_unit(line[1]),
        average_lower=average[0],
        average_upper=average[1],
        min_lower=minimum[0],
        min_upper=minimum[1],
        max_lower=maximum[0],
        max_upper=maximum[1],
        limit=_parse_limit(line),
    )


def parse_pdf(year: int, ipath: Path) -> list[CsvRow]:
    """Run ``pdftotext`` on pages 1-4 of ``ipath`` and parse the chemical rows.

    A text line is treated as a table row when it contains ``"/L"``, contains
    neither ``" of "`` nor ``" sample"``, splits into more than two columns,
    and its first column does not mention ``"Total Organic Carbon"``.

    Raises:
        RuntimeError: if ``pdftotext`` exits with a non-zero status.
    """
    res = sp.run(
        ["pdftotext", "-f", "1", "-l", "4", "-r", "1000", "-layout", ipath, "-"],
        capture_output=True,
    )
    if res.returncode != 0:
        # An explicit raise (rather than `assert`) still fires under `python -O`.
        stderr = res.stderr.decode(errors="replace").strip()
        raise RuntimeError(
            f"pdftotext failed on {ipath} (exit code {res.returncode}): {stderr}"
        )

    rows: list[CsvRow] = []
    for raw in res.stdout.decode().splitlines():
        if "/L" not in raw or " of " in raw or " sample" in raw:
            continue
        columns = _COLUMN_SEP_RE.split(raw.strip())
        if len(columns) > 2 and "Total Organic Carbon" not in columns[0]:
            rows.append(parse_chemical(year, columns))
    return rows


def main(year: int, ipath: str, opath: str) -> None:
    """Parse the PDF at ``ipath`` and write its rows to ``opath`` as TSV."""
    rows = parse_pdf(year, ipath)
    # newline="" is required by the csv module so the writer controls line
    # endings itself; UTF-8 matches the decoding of the pdftotext output.
    with open(opath, "w", newline="", encoding="utf-8") as f:
        csv.writer(f, delimiter="\t").writerows(rows)


if __name__ == "__main__":
    # `snakemake` is not defined in this file; presumably it is injected by
    # Snakemake's `script:` directive, which runs the file as __main__.
    main(snakemake.wildcards["year"], snakemake.input[0], snakemake.output[0])  # noqa: F821