90 lines
2.2 KiB
Python
90 lines
2.2 KiB
Python
|
import csv
|
||
|
import sys
|
||
|
import re
|
||
|
import subprocess as sp
|
||
|
from datetime import datetime
|
||
|
from pathlib import Path
|
||
|
from typing import NamedTuple
|
||
|
|
||
|
|
||
|
class CsvRow(NamedTuple):
    """One parsed table row; field order is the column order of the output.

    Average, minimum and maximum are each stored as a ``(lower, upper)`` pair
    of bounds rather than as a single number.
    """

    year: int
    species: str
    unit: str

    # Bounds of the average value.
    average_lower: float
    average_upper: float

    # Bounds of the minimum value.
    min_lower: float
    min_upper: float

    # Bounds of the maximum value.
    max_lower: float
    max_upper: float

    # Value of the limit column.
    limit: float
|
||
|
|
||
|
|
||
|
def fmt_float(x: str) -> float:
    """Convert one table cell to a number.

    Only the first whitespace-separated token of the cell is interpreted;
    anything after it is ignored, because extra text sometimes shows up
    after the value when the cell carries a superscript. The markers
    ``"n/d"`` and ``"n.d"`` (presumably "not detected") are mapped to 0.

    Args:
        x: raw cell text.

    Returns:
        The numeric value of the cell, or 0 for a marker cell.

    Raises:
        ValueError: if the cell is blank or its first token is not a number.
    """
    # split() with no argument ignores leading whitespace and splits on any
    # whitespace run, so " 0.5" and "0.5\t1" parse instead of crashing.
    tokens = x.split()
    if not tokens:
        raise ValueError(f"empty numeric cell: {x!r}")
    value = tokens[0]
    if value in ("n/d", "n.d"):
        # Deliberately the int literal 0 (not 0.0) so that the text written
        # for these cells stays exactly as before.
        return 0
    return float(value)
|
||
|
|
||
|
|
||
|
def fmt_lt(x: str) -> tuple[float, float]:
    """Convert one table cell to a ``(lower, upper)`` pair of bounds.

    A cell starting with ``"<"`` (e.g. ``"<0.5"``) gives ``(0, 0.5)``: only
    an upper bound is known. Any other cell gives the same value for both
    bounds.

    Args:
        x: raw cell text.

    Returns:
        The ``(lower, upper)`` bounds for the cell.

    Raises:
        ValueError: propagated from ``fmt_float`` when the cell is not numeric.
    """
    cell = x.strip()
    # startswith() is safe on an empty cell, unlike indexing cell[0].
    if cell.startswith("<"):
        # Strip again so that "< 0.5" is read the same way as "<0.5".
        return (0, fmt_float(cell.removeprefix("<").strip()))
    value = fmt_float(cell)
    return (value, value)
|
||
|
|
||
|
|
||
|
def parse_chemical(year, line: list[str]) -> CsvRow:
    """Build a ``CsvRow`` from the cells of one table line.

    Args:
        year: stored unchanged in the row's ``year`` field.
        line: cells of the line, read by position as
            ``[species, unit, average, max, min, limit]``. The limit cell
            is optional.

    Returns:
        The parsed row. ``limit`` is -1 when the limit cell is missing or
        does not start with a number.

    Raises:
        IndexError: if ``line`` has fewer than five cells.
        ValueError: propagated from ``fmt_lt`` when a value cell is not
            numeric.
    """
    # Raw string: "\d" inside a plain string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    limit_match = re.match(r"\d+(\.\d+)?", line[5]) if len(line) > 5 else None
    # Only the leading number of the limit cell is used; -1 is the sentinel
    # for "no usable limit".
    limit = float(limit_match[0]) if limit_match else -1

    average = fmt_lt(line[2])
    maximum = fmt_lt(line[3])
    minimum = fmt_lt(line[4])

    # Normalise both encodings of the micro prefix (MICRO SIGN U+00B5 and
    # GREEK SMALL LETTER MU U+03BC) to a plain ASCII "u".
    unit = line[1].replace("\u00b5", "u").replace("\u03bc", "u")

    return CsvRow(
        year=year,
        species=line[0],
        unit=unit,
        average_lower=average[0],
        average_upper=average[1],
        min_lower=minimum[0],
        min_upper=minimum[1],
        max_lower=maximum[0],
        max_upper=maximum[1],
        limit=limit,
    )
|
||
|
|
||
|
|
||
|
def parse_pdf(year: int, ipath: Path) -> list[CsvRow]:
    """Extract the chemical rows from the first four pages of a PDF.

    The PDF is converted to text with ``pdftotext -layout``. A text line is
    treated as a table row when it contains ``"/L"`` but neither ``" of "``
    nor ``" sample"``. Each such line is split into cells on runs of two or
    more whitespace characters. Lines with two or fewer cells, or whose
    first cell contains "Total Organic Carbon", are skipped.

    Args:
        year: passed through to every parsed row.
        ipath: path of the PDF to read.

    Returns:
        One ``CsvRow`` per accepted line, in document order.

    Raises:
        RuntimeError: if ``pdftotext`` exits with a non-zero status.
    """
    res = sp.run(
        ["pdftotext", "-f", "1", "-l", "4", "-r", "1000", "-layout", ipath, "-"],
        capture_output=True,
    )
    if res.returncode != 0:
        # An explicit raise instead of ``assert False``: assert statements
        # are removed under ``python -O``, which would let the failure pass
        # silently and return None.
        raise RuntimeError(
            f"pdftotext failed with exit code {res.returncode}: "
            f"{res.stderr.decode(errors='replace')}"
        )

    column_gap = re.compile(r"\s\s+")
    rows = []
    for raw_line in res.stdout.decode().splitlines():
        if "/L" not in raw_line or " of " in raw_line or " sample" in raw_line:
            continue
        cells = column_gap.split(raw_line.strip())
        if len(cells) > 2 and "Total Organic Carbon" not in cells[0]:
            rows.append(parse_chemical(year, cells))
    return rows
|
||
|
|
||
|
|
||
|
def main(year: int, ipath: str, opath: str) -> None:
    """Parse the PDF at ``ipath`` and write its rows to ``opath``.

    The output is tab-separated with one line per parsed row and no header
    line.

    Args:
        year: passed through to every parsed row.
        ipath: path of the PDF to read.
        opath: path of the tab-separated file to create or overwrite.
    """
    rows = parse_pdf(year, ipath)
    # newline="" is what the csv module expects of files it writes to;
    # without it, platforms that translate "\n" emit "\r\r\n" line endings.
    with open(opath, "w", newline="") as f:
        csv.writer(f, delimiter="\t").writerows(rows)
|
||
|
|
||
|
|
||
|
# NOTE(review): `snakemake` is neither defined nor imported in this file;
# presumably Snakemake injects it when the file is run through a `script:`
# rule -- confirm against the workflow.
main(
    snakemake.wildcards["year"],
    snakemake.input[0],
    snakemake.output[0],
)
|