ENH use bytestring to speed up regex matching

This commit is contained in:
Nathan Dwarshuis 2023-02-05 12:22:00 -05:00
parent bd72bec920
commit 26c0dff080
4 changed files with 43 additions and 24 deletions

View File

@ -39,6 +39,7 @@ library
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wredundant-constraints -Wpartial-fields -Werror -O2 ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wredundant-constraints -Wpartial-fields -Werror -O2
build-depends: build-depends:
base >=4.12 && <10 base >=4.12 && <10
, bytestring
, cassava , cassava
, conduit >=1.3.4.2 , conduit >=1.3.4.2
, containers >=0.6.4.1 , containers >=0.6.4.1
@ -71,6 +72,7 @@ executable pwncash
build-depends: build-depends:
base >=4.12 && <10 base >=4.12 && <10
, budget , budget
, bytestring
, cassava , cassava
, conduit >=1.3.4.2 , conduit >=1.3.4.2
, containers >=0.6.4.1 , containers >=0.6.4.1

View File

@ -23,6 +23,7 @@ import Dhall hiding (embed, maybe)
import Dhall.TH import Dhall.TH
import Language.Haskell.TH.Syntax (Lift) import Language.Haskell.TH.Syntax (Lift)
import RIO import RIO
import qualified RIO.ByteString as B
import qualified RIO.Map as M import qualified RIO.Map as M
import qualified RIO.Text as T import qualified RIO.Text as T
import RIO.Time import RIO.Time
@ -517,8 +518,8 @@ data AcntPath = AcntPath
data TxRecord = TxRecord data TxRecord = TxRecord
{ trDate :: !Day { trDate :: !Day
, trAmount :: !Rational , trAmount :: !Rational
, trDesc :: !T.Text , trDesc :: !B.ByteString
, trOther :: !(M.Map T.Text T.Text) , trOther :: !(M.Map T.Text B.ByteString)
, trFile :: !FilePath , trFile :: !FilePath
} }
deriving (Show, Eq, Ord) deriving (Show, Eq, Ord)
@ -586,7 +587,7 @@ data InsertError
| MatchValPrecisionError !Natural !Natural | MatchValPrecisionError !Natural !Natural
| InsertIOError !T.Text | InsertIOError !T.Text
| ParseError !T.Text | ParseError !T.Text
| ConversionError !T.Text | ConversionError !B.ByteString
| LookupError !LookupSuberr !T.Text | LookupError !LookupSuberr !T.Text
| BalanceError !BalanceType !CurID ![RawSplit] | BalanceError !BalanceType !CurID ![RawSplit]
| IncomeError !DatePat | IncomeError !DatePat

View File

@ -38,16 +38,18 @@ module Internal.Utils
) )
where where
import qualified Data.ByteString.Char8 as B8
import Data.Time.Format.ISO8601 import Data.Time.Format.ISO8601
import GHC.Real import GHC.Real
import Internal.Types import Internal.Types
import RIO import RIO
import qualified RIO.ByteString as B
import qualified RIO.List as L import qualified RIO.List as L
import qualified RIO.Map as M import qualified RIO.Map as M
import qualified RIO.Text as T import qualified RIO.Text as T
import RIO.Time import RIO.Time
import Text.Regex.TDFA import Text.Regex.TDFA
import Text.Regex.TDFA.Text import Text.Regex.TDFA.ByteString
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- dates -- dates
@ -172,7 +174,7 @@ toTx sc sa toSplits r@TxRecord {..} =
in Tx in Tx
{ txTags = [] { txTags = []
, txDate = trDate , txDate = trDate
, txDescr = trDesc , txDescr = decodeUtf8Lenient trDesc
, txSplits = fromSplit : ss_ , txSplits = fromSplit : ss_
} }
where where
@ -196,7 +198,7 @@ valMatches MatchVal {..} x
dateMatches :: MatchDate -> Day -> Bool dateMatches :: MatchDate -> Day -> Bool
dateMatches md = (EQ ==) . compareDate md dateMatches md = (EQ ==) . compareDate md
otherMatches :: M.Map T.Text T.Text -> MatchOtherRe -> EitherErr Bool otherMatches :: M.Map T.Text B.ByteString -> MatchOtherRe -> EitherErr Bool
otherMatches dict m = case m of otherMatches dict m = case m of
Val (Field n mv) -> valMatches mv =<< (readRational =<< lookup_ MatchNumeric n) Val (Field n mv) -> valMatches mv =<< (readRational =<< lookup_ MatchNumeric n)
Desc (Field n (_, md)) -> (`matchMaybe` md) =<< lookup_ MatchText n Desc (Field n (_, md)) -> (`matchMaybe` md) =<< lookup_ MatchText n
@ -226,13 +228,13 @@ resolveCurrency = resolveSplitField CurField
resolveSplitField :: SplitIDType -> TxRecord -> SplitAcnt -> EitherErrs T.Text resolveSplitField :: SplitIDType -> TxRecord -> SplitAcnt -> EitherErrs T.Text
resolveSplitField t TxRecord {trOther = o} s = case s of resolveSplitField t TxRecord {trOther = o} s = case s of
ConstT p -> Right p ConstT p -> Right p
LookupT f -> plural $ lookup_ f o LookupT f -> decodeUtf8Lenient <$> plural (lookup_ f o)
MapT (Field f m) -> plural $ do MapT (Field f m) -> plural $ do
k <- lookup_ f o k <- lookup_ f o
lookup_ k m lookup_ (decodeUtf8Lenient k) m
Map2T (Field (f1, f2) m) -> do Map2T (Field (f1, f2) m) -> do
(k1, k2) <- concatEither2 (lookup_ f1 o) (lookup_ f2 o) (,) (k1, k2) <- concatEither2 (lookup_ f1 o) (lookup_ f2 o) (,)
plural $ lookup_ (k1, k2) m plural $ lookup_ (decodeUtf8Lenient k1, decodeUtf8Lenient k2) m
where where
lookup_ :: (Ord k, Show k) => k -> M.Map k v -> EitherErr v lookup_ :: (Ord k, Show k) => k -> M.Map k v -> EitherErr v
lookup_ = lookupErr (SplitIDField t) lookup_ = lookupErr (SplitIDField t)
@ -242,46 +244,58 @@ lookupErr what k m = case M.lookup k m of
Just x -> Right x Just x -> Right x
_ -> Left $ LookupError what $ showT k _ -> Left $ LookupError what $ showT k
parseRational :: MonadFail m => (T.Text, Regex) -> T.Text -> m Rational parseRational :: MonadFail m => (T.Text, Regex) -> B.ByteString -> m Rational
parseRational (pat, re) s = case matchGroupsMaybe s re of parseRational (pat, re) s = case matchGroupsMaybe s re of
[sign, x, ""] -> uncurry (*) <$> readWhole sign x [sign, x, ""] -> uncurry (*) <$> readWhole sign x
[sign, x, y] -> do [sign, x, y] -> do
d <- readT "decimal" y d <- readT "decimal" y
let p = 10 ^ T.length y let p = 10 ^ B.length y
(k, w) <- readWhole sign x (k, w) <- readWhole sign x
return $ k * (w + d % p) return $ k * (w + d % p)
_ -> msg "malformed decimal" _ -> msg "malformed decimal"
where where
readT what t = case readMaybe $ T.unpack t of readT what t = case readWholeInteger t of
Just d -> return $ fromInteger d Just d -> return $ fromInteger d
_ -> msg $ T.unwords ["could not parse", what, t] _ -> msg $ T.unwords ["could not parse", what, decodeUtf8Lenient t]
msg m = msg m =
fail $ fail $
T.unpack $ T.unpack $
T.unwords [m, "-", keyVals [("pattern", pat), ("query", s)]] T.unwords [m, "-", keyVals [("pattern", pat), ("query", decodeUtf8Lenient s)]]
readSign x readSign x
| x == "-" = return (-1) | x == "-" = return (-1)
| x == "+" || x == "" = return 1 | x == "+" || x == "" = return 1
| otherwise = msg $ T.append "invalid sign: " x | otherwise = msg $ T.append "invalid sign: " $ decodeUtf8Lenient x
readWhole sign x = do readWhole sign x = do
w <- readT "whole number" x w <- readT "whole number" x
k <- readSign sign k <- readSign sign
return (k, w) return (k, w)
readRational :: T.Text -> EitherErr Rational readRational :: B.ByteString -> EitherErr Rational
readRational s = case T.split (== '.') s of readRational s = case B.split 46 s of -- 46 is '.'
[x] -> maybe err (return . fromInteger) $ readT x [x] -> maybe err (return . fromInteger) $ readT x
[x, y] -> case (readT x, readT y) of [x, y] -> case (readT x, readT y) of
(Just x', Just y') -> (Just x', Just y') ->
let p = 10 ^ T.length y let p = 10 ^ B.length y
k = if x' >= 0 then 1 else -1 k = if x' >= 0 then 1 else -1
in return $ fromInteger x' + k * y' % p in return $ fromInteger x' + k * y' % p
_ -> err _ -> err
_ -> err _ -> err
where where
readT = readMaybe . T.unpack readT i = case B8.readInteger i of
Just (x, "") -> Just x
_ -> Nothing
err = Left $ ConversionError s err = Left $ ConversionError s
readWholeInteger :: B.ByteString -> Maybe Integer
readWholeInteger s = case B8.readInteger s of
Just (x, "") -> Just x
_ -> Nothing
-- readWholeInt :: B.ByteString -> Maybe Int
-- readWholeInt s = case B8.readInt s of
-- Just (x, "") -> Just x
-- _ -> Nothing
-- TODO smells like a lens -- TODO smells like a lens
-- mapTxSplits :: (a -> b) -> Tx a -> Tx b -- mapTxSplits :: (a -> b) -> Tx a -> Tx b
-- mapTxSplits f t@Tx {txSplits = ss} = t {txSplits = fmap f ss} -- mapTxSplits f t@Tx {txSplits = ss} = t {txSplits = fmap f ss}
@ -332,7 +346,8 @@ showError other = (: []) $ case other of
ZeroLength -> "Zero repeat length" ZeroLength -> "Zero repeat length"
ZeroRepeats -> "Zero repeats" ZeroRepeats -> "Zero repeats"
(RegexError re) -> T.append "could not make regex from pattern: " re (RegexError re) -> T.append "could not make regex from pattern: " re
(ConversionError x) -> T.append "Could not convert to rational number: " x (ConversionError x) ->
T.append "Could not convert to rational number: " $ decodeUtf8Lenient x
(InsertIOError msg) -> T.append "IO Error: " msg (InsertIOError msg) -> T.append "IO Error: " msg
(ParseError msg) -> T.append "Parse Error: " msg (ParseError msg) -> T.append "Parse Error: " msg
(MatchValPrecisionError d p) -> (MatchValPrecisionError d p) ->
@ -375,7 +390,7 @@ showTx TxRecord {trDate = d, trAmount = v, trDesc = e, trFile = f} =
[ ("path", T.pack f) [ ("path", T.pack f)
, ("date", T.pack $ iso8601Show d) , ("date", T.pack $ iso8601Show d)
, ("value", showT (fromRational v :: Float)) , ("value", showT (fromRational v :: Float))
, ("description", doubleQuote e) , ("description", doubleQuote $ decodeUtf8Lenient e)
] ]
showMatch :: MatchRe -> T.Text showMatch :: MatchRe -> T.Text
@ -584,14 +599,14 @@ compileRegex groups pat = case res of
compile compile
(blankCompOpt {newSyntax = True}) (blankCompOpt {newSyntax = True})
(blankExecOpt {captureGroups = groups}) (blankExecOpt {captureGroups = groups})
pat (encodeUtf8 pat)
matchMaybe :: T.Text -> Regex -> EitherErr Bool matchMaybe :: B.ByteString -> Regex -> EitherErr Bool
matchMaybe q re = case execute re q of matchMaybe q re = case execute re q of
Right res -> Right $ isJust res Right res -> Right $ isJust res
Left _ -> Left $ RegexError "this should not happen" Left _ -> Left $ RegexError "this should not happen"
matchGroupsMaybe :: T.Text -> Regex -> [T.Text] matchGroupsMaybe :: B.ByteString -> Regex -> [B.ByteString]
matchGroupsMaybe q re = case regexec re q of matchGroupsMaybe q re = case regexec re q of
Right Nothing -> [] Right Nothing -> []
Right (Just (_, _, _, xs)) -> xs Right (Just (_, _, _, xs)) -> xs

View File

@ -41,6 +41,7 @@ dependencies:
- recursion-schemes - recursion-schemes
- data-fix - data-fix
- filepath - filepath
- bytestring
library: library:
source-dirs: lib/ source-dirs: lib/