From 26c0dff080ffbdccbb79390fa05cb601574fc1ba Mon Sep 17 00:00:00 2001 From: ndwarshuis Date: Sun, 5 Feb 2023 12:22:00 -0500 Subject: [PATCH] ENH use bytestring to speed up regex matching --- budget.cabal | 2 ++ lib/Internal/Types.hs | 7 +++--- lib/Internal/Utils.hs | 57 +++++++++++++++++++++++++++---------------- package.yaml | 1 + 4 files changed, 43 insertions(+), 24 deletions(-) diff --git a/budget.cabal b/budget.cabal index 90ea7b0..80c9495 100644 --- a/budget.cabal +++ b/budget.cabal @@ -39,6 +39,7 @@ library ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wredundant-constraints -Wpartial-fields -Werror -O2 build-depends: base >=4.12 && <10 + , bytestring , cassava , conduit >=1.3.4.2 , containers >=0.6.4.1 @@ -71,6 +72,7 @@ executable pwncash build-depends: base >=4.12 && <10 , budget + , bytestring , cassava , conduit >=1.3.4.2 , containers >=0.6.4.1 diff --git a/lib/Internal/Types.hs b/lib/Internal/Types.hs index 7e1c2a0..1dcde73 100644 --- a/lib/Internal/Types.hs +++ b/lib/Internal/Types.hs @@ -23,6 +23,7 @@ import Dhall hiding (embed, maybe) import Dhall.TH import Language.Haskell.TH.Syntax (Lift) import RIO +import qualified RIO.ByteString as B import qualified RIO.Map as M import qualified RIO.Text as T import RIO.Time @@ -517,8 +518,8 @@ data AcntPath = AcntPath data TxRecord = TxRecord { trDate :: !Day , trAmount :: !Rational - , trDesc :: !T.Text - , trOther :: !(M.Map T.Text T.Text) + , trDesc :: !B.ByteString + , trOther :: !(M.Map T.Text B.ByteString) , trFile :: !FilePath } deriving (Show, Eq, Ord) @@ -586,7 +587,7 @@ data InsertError | MatchValPrecisionError !Natural !Natural | InsertIOError !T.Text | ParseError !T.Text - | ConversionError !T.Text + | ConversionError !B.ByteString | LookupError !LookupSuberr !T.Text | BalanceError !BalanceType !CurID ![RawSplit] | IncomeError !DatePat diff --git a/lib/Internal/Utils.hs b/lib/Internal/Utils.hs index aa3446b..d23b98b 100644 --- a/lib/Internal/Utils.hs +++ b/lib/Internal/Utils.hs @@ -38,16 +38,18 @@ module Internal.Utils ) where +import qualified Data.ByteString.Char8 as B8 import Data.Time.Format.ISO8601 import GHC.Real import Internal.Types import RIO +import qualified RIO.ByteString as B import qualified RIO.List as L import qualified RIO.Map as M import qualified RIO.Text as T import RIO.Time import Text.Regex.TDFA -import Text.Regex.TDFA.Text +import Text.Regex.TDFA.ByteString -------------------------------------------------------------------------------- -- dates @@ -172,7 +174,7 @@ toTx sc sa toSplits r@TxRecord {..} = in Tx { txTags = [] , txDate = trDate - , txDescr = trDesc + , txDescr = decodeUtf8Lenient trDesc , txSplits = fromSplit : ss_ } where @@ -196,7 +198,7 @@ valMatches MatchVal {..} x dateMatches :: MatchDate -> Day -> Bool dateMatches md = (EQ ==) . compareDate md -otherMatches :: M.Map T.Text T.Text -> MatchOtherRe -> EitherErr Bool +otherMatches :: M.Map T.Text B.ByteString -> MatchOtherRe -> EitherErr Bool otherMatches dict m = case m of Val (Field n mv) -> valMatches mv =<< (readRational =<< lookup_ MatchNumeric n) Desc (Field n (_, md)) -> (`matchMaybe` md) =<< lookup_ MatchText n @@ -226,13 +228,13 @@ resolveCurrency = resolveSplitField CurField resolveSplitField :: SplitIDType -> TxRecord -> SplitAcnt -> EitherErrs T.Text resolveSplitField t TxRecord {trOther = o} s = case s of ConstT p -> Right p - LookupT f -> plural $ lookup_ f o + LookupT f -> decodeUtf8Lenient <$> plural (lookup_ f o) MapT (Field f m) -> plural $ do k <- lookup_ f o - lookup_ k m + lookup_ (decodeUtf8Lenient k) m Map2T (Field (f1, f2) m) -> do (k1, k2) <- concatEither2 (lookup_ f1 o) (lookup_ f2 o) (,) - plural $ lookup_ (k1, k2) m + plural $ lookup_ (decodeUtf8Lenient k1, decodeUtf8Lenient k2) m where lookup_ :: (Ord k, Show k) => k -> M.Map k v -> EitherErr v lookup_ = lookupErr (SplitIDField t) @@ -242,46 +244,58 @@ lookupErr what k m = case M.lookup k m of Just x -> Right x _ -> Left $ LookupError what $ showT k -parseRational :: MonadFail m => (T.Text, Regex) -> T.Text -> m Rational +parseRational :: MonadFail m => (T.Text, Regex) -> B.ByteString -> m Rational parseRational (pat, re) s = case matchGroupsMaybe s re of [sign, x, ""] -> uncurry (*) <$> readWhole sign x [sign, x, y] -> do d <- readT "decimal" y - let p = 10 ^ T.length y + let p = 10 ^ B.length y (k, w) <- readWhole sign x return $ k * (w + d % p) _ -> msg "malformed decimal" where - readT what t = case readMaybe $ T.unpack t of + readT what t = case readWholeInteger t of Just d -> return $ fromInteger d - _ -> msg $ T.unwords ["could not parse", what, t] + _ -> msg $ T.unwords ["could not parse", what, decodeUtf8Lenient t] msg m = fail $ T.unpack $ - T.unwords [m, "-", keyVals [("pattern", pat), ("query", s)]] + T.unwords [m, "-", keyVals [("pattern", pat), ("query", decodeUtf8Lenient s)]] readSign x | x == "-" = return (-1) | x == "+" || x == "" = return 1 - | otherwise = msg $ T.append "invalid sign: " x + | otherwise = msg $ T.append "invalid sign: " $ decodeUtf8Lenient x readWhole sign x = do w <- readT "whole number" x k <- readSign sign return (k, w) -readRational :: T.Text -> EitherErr Rational -readRational s = case T.split (== '.') s of +readRational :: B.ByteString -> EitherErr Rational +readRational s = case B.split 46 s of -- 46 is '.' [x] -> maybe err (return . fromInteger) $ readT x [x, y] -> case (readT x, readT y) of (Just x', Just y') -> - let p = 10 ^ T.length y + let p = 10 ^ B.length y k = if x' >= 0 then 1 else -1 in return $ fromInteger x' + k * y' % p _ -> err _ -> err where - readT = readMaybe . T.unpack + readT i = case B8.readInteger i of + Just (x, "") -> Just x + _ -> Nothing err = Left $ ConversionError s +readWholeInteger :: B.ByteString -> Maybe Integer +readWholeInteger s = case B8.readInteger s of + Just (x, "") -> Just x + _ -> Nothing + +-- readWholeInt :: B.ByteString -> Maybe Int +-- readWholeInt s = case B8.readInt s of +-- Just (x, "") -> Just x +-- _ -> Nothing + -- TODO smells like a lens -- mapTxSplits :: (a -> b) -> Tx a -> Tx b -- mapTxSplits f t@Tx {txSplits = ss} = t {txSplits = fmap f ss} @@ -332,7 +346,8 @@ showError other = (: []) $ case other of ZeroLength -> "Zero repeat length" ZeroRepeats -> "Zero repeats" (RegexError re) -> T.append "could not make regex from pattern: " re - (ConversionError x) -> T.append "Could not convert to rational number: " x + (ConversionError x) -> + T.append "Could not convert to rational number: " $ decodeUtf8Lenient x (InsertIOError msg) -> T.append "IO Error: " msg (ParseError msg) -> T.append "Parse Error: " msg (MatchValPrecisionError d p) -> @@ -375,7 +390,7 @@ showTx TxRecord {trDate = d, trAmount = v, trDesc = e, trFile = f} = [ ("path", T.pack f) , ("date", T.pack $ iso8601Show d) , ("value", showT (fromRational v :: Float)) - , ("description", doubleQuote e) + , ("description", doubleQuote $ decodeUtf8Lenient e) ] showMatch :: MatchRe -> T.Text @@ -584,14 +599,14 @@ compileRegex groups pat = case res of compile (blankCompOpt {newSyntax = True}) (blankExecOpt {captureGroups = groups}) - pat + (encodeUtf8 pat) -matchMaybe :: T.Text -> Regex -> EitherErr Bool +matchMaybe :: B.ByteString -> Regex -> EitherErr Bool matchMaybe q re = case execute re q of Right res -> Right $ isJust res Left _ -> Left $ RegexError "this should not happen" -matchGroupsMaybe :: T.Text -> Regex -> [T.Text] +matchGroupsMaybe :: B.ByteString -> Regex -> [B.ByteString] matchGroupsMaybe q re = case regexec re q of Right Nothing -> [] Right (Just (_, _, _, xs)) -> xs diff --git a/package.yaml b/package.yaml index fe911f1..2d68c2b 100644 --- a/package.yaml +++ b/package.yaml @@ -41,6 +41,7 @@ dependencies: - recursion-schemes - data-fix - filepath +- bytestring library: source-dirs: lib/