ENH use bytestring to speed up regex matching

This commit is contained in:
Nathan Dwarshuis 2023-02-05 12:22:00 -05:00
parent bd72bec920
commit 26c0dff080
4 changed files with 43 additions and 24 deletions

View File

@ -39,6 +39,7 @@ library
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wredundant-constraints -Wpartial-fields -Werror -O2
build-depends:
base >=4.12 && <10
, bytestring
, cassava
, conduit >=1.3.4.2
, containers >=0.6.4.1
@ -71,6 +72,7 @@ executable pwncash
build-depends:
base >=4.12 && <10
, budget
, bytestring
, cassava
, conduit >=1.3.4.2
, containers >=0.6.4.1

View File

@ -23,6 +23,7 @@ import Dhall hiding (embed, maybe)
import Dhall.TH
import Language.Haskell.TH.Syntax (Lift)
import RIO
import qualified RIO.ByteString as B
import qualified RIO.Map as M
import qualified RIO.Text as T
import RIO.Time
@ -517,8 +518,8 @@ data AcntPath = AcntPath
data TxRecord = TxRecord
{ trDate :: !Day
, trAmount :: !Rational
, trDesc :: !T.Text
, trOther :: !(M.Map T.Text T.Text)
, trDesc :: !B.ByteString
, trOther :: !(M.Map T.Text B.ByteString)
, trFile :: !FilePath
}
deriving (Show, Eq, Ord)
@ -586,7 +587,7 @@ data InsertError
| MatchValPrecisionError !Natural !Natural
| InsertIOError !T.Text
| ParseError !T.Text
| ConversionError !T.Text
| ConversionError !B.ByteString
| LookupError !LookupSuberr !T.Text
| BalanceError !BalanceType !CurID ![RawSplit]
| IncomeError !DatePat

View File

@ -38,16 +38,18 @@ module Internal.Utils
)
where
import qualified Data.ByteString.Char8 as B8
import Data.Time.Format.ISO8601
import GHC.Real
import Internal.Types
import RIO
import qualified RIO.ByteString as B
import qualified RIO.List as L
import qualified RIO.Map as M
import qualified RIO.Text as T
import RIO.Time
import Text.Regex.TDFA
import Text.Regex.TDFA.Text
import Text.Regex.TDFA.ByteString
--------------------------------------------------------------------------------
-- dates
@ -172,7 +174,7 @@ toTx sc sa toSplits r@TxRecord {..} =
in Tx
{ txTags = []
, txDate = trDate
, txDescr = trDesc
, txDescr = decodeUtf8Lenient trDesc
, txSplits = fromSplit : ss_
}
where
@ -196,7 +198,7 @@ valMatches MatchVal {..} x
dateMatches :: MatchDate -> Day -> Bool
dateMatches md = (EQ ==) . compareDate md
otherMatches :: M.Map T.Text T.Text -> MatchOtherRe -> EitherErr Bool
otherMatches :: M.Map T.Text B.ByteString -> MatchOtherRe -> EitherErr Bool
otherMatches dict m = case m of
Val (Field n mv) -> valMatches mv =<< (readRational =<< lookup_ MatchNumeric n)
Desc (Field n (_, md)) -> (`matchMaybe` md) =<< lookup_ MatchText n
@ -226,13 +228,13 @@ resolveCurrency = resolveSplitField CurField
resolveSplitField :: SplitIDType -> TxRecord -> SplitAcnt -> EitherErrs T.Text
resolveSplitField t TxRecord {trOther = o} s = case s of
ConstT p -> Right p
LookupT f -> plural $ lookup_ f o
LookupT f -> decodeUtf8Lenient <$> plural (lookup_ f o)
MapT (Field f m) -> plural $ do
k <- lookup_ f o
lookup_ k m
lookup_ (decodeUtf8Lenient k) m
Map2T (Field (f1, f2) m) -> do
(k1, k2) <- concatEither2 (lookup_ f1 o) (lookup_ f2 o) (,)
plural $ lookup_ (k1, k2) m
plural $ lookup_ (decodeUtf8Lenient k1, decodeUtf8Lenient k2) m
where
lookup_ :: (Ord k, Show k) => k -> M.Map k v -> EitherErr v
lookup_ = lookupErr (SplitIDField t)
@ -242,46 +244,58 @@ lookupErr what k m = case M.lookup k m of
Just x -> Right x
_ -> Left $ LookupError what $ showT k
parseRational :: MonadFail m => (T.Text, Regex) -> T.Text -> m Rational
parseRational :: MonadFail m => (T.Text, Regex) -> B.ByteString -> m Rational
parseRational (pat, re) s = case matchGroupsMaybe s re of
[sign, x, ""] -> uncurry (*) <$> readWhole sign x
[sign, x, y] -> do
d <- readT "decimal" y
let p = 10 ^ T.length y
let p = 10 ^ B.length y
(k, w) <- readWhole sign x
return $ k * (w + d % p)
_ -> msg "malformed decimal"
where
readT what t = case readMaybe $ T.unpack t of
readT what t = case readWholeInteger t of
Just d -> return $ fromInteger d
_ -> msg $ T.unwords ["could not parse", what, t]
_ -> msg $ T.unwords ["could not parse", what, decodeUtf8Lenient t]
msg m =
fail $
T.unpack $
T.unwords [m, "-", keyVals [("pattern", pat), ("query", s)]]
T.unwords [m, "-", keyVals [("pattern", pat), ("query", decodeUtf8Lenient s)]]
readSign x
| x == "-" = return (-1)
| x == "+" || x == "" = return 1
| otherwise = msg $ T.append "invalid sign: " x
| otherwise = msg $ T.append "invalid sign: " $ decodeUtf8Lenient x
readWhole sign x = do
w <- readT "whole number" x
k <- readSign sign
return (k, w)
readRational :: T.Text -> EitherErr Rational
readRational s = case T.split (== '.') s of
readRational :: B.ByteString -> EitherErr Rational
readRational s = case B.split 46 s of -- 46 is '.'
[x] -> maybe err (return . fromInteger) $ readT x
[x, y] -> case (readT x, readT y) of
(Just x', Just y') ->
let p = 10 ^ T.length y
let p = 10 ^ B.length y
k = if x' >= 0 then 1 else -1
in return $ fromInteger x' + k * y' % p
_ -> err
_ -> err
where
readT = readMaybe . T.unpack
readT i = case B8.readInteger i of
Just (x, "") -> Just x
_ -> Nothing
err = Left $ ConversionError s
readWholeInteger :: B.ByteString -> Maybe Integer
readWholeInteger s = case B8.readInteger s of
Just (x, "") -> Just x
_ -> Nothing
-- readWholeInt :: B.ByteString -> Maybe Int
-- readWholeInt s = case B8.readInt s of
-- Just (x, "") -> Just x
-- _ -> Nothing
-- TODO smells like a lens
-- mapTxSplits :: (a -> b) -> Tx a -> Tx b
-- mapTxSplits f t@Tx {txSplits = ss} = t {txSplits = fmap f ss}
@ -332,7 +346,8 @@ showError other = (: []) $ case other of
ZeroLength -> "Zero repeat length"
ZeroRepeats -> "Zero repeats"
(RegexError re) -> T.append "could not make regex from pattern: " re
(ConversionError x) -> T.append "Could not convert to rational number: " x
(ConversionError x) ->
T.append "Could not convert to rational number: " $ decodeUtf8Lenient x
(InsertIOError msg) -> T.append "IO Error: " msg
(ParseError msg) -> T.append "Parse Error: " msg
(MatchValPrecisionError d p) ->
@ -375,7 +390,7 @@ showTx TxRecord {trDate = d, trAmount = v, trDesc = e, trFile = f} =
[ ("path", T.pack f)
, ("date", T.pack $ iso8601Show d)
, ("value", showT (fromRational v :: Float))
, ("description", doubleQuote e)
, ("description", doubleQuote $ decodeUtf8Lenient e)
]
showMatch :: MatchRe -> T.Text
@ -584,14 +599,14 @@ compileRegex groups pat = case res of
compile
(blankCompOpt {newSyntax = True})
(blankExecOpt {captureGroups = groups})
pat
(encodeUtf8 pat)
matchMaybe :: T.Text -> Regex -> EitherErr Bool
matchMaybe :: B.ByteString -> Regex -> EitherErr Bool
matchMaybe q re = case execute re q of
Right res -> Right $ isJust res
Left _ -> Left $ RegexError "this should not happen"
matchGroupsMaybe :: T.Text -> Regex -> [T.Text]
matchGroupsMaybe :: B.ByteString -> Regex -> [B.ByteString]
matchGroupsMaybe q re = case regexec re q of
Right Nothing -> []
Right (Just (_, _, _, xs)) -> xs

View File

@ -41,6 +41,7 @@ dependencies:
- recursion-schemes
- data-fix
- filepath
- bytestring
library:
source-dirs: lib/