hbs2/src/Data/Text/Fuzzy/Tokenize.hs

557 lines
20 KiB
Haskell

-- |
-- Module : Data.Text.Fuzzy.Tokenize
-- Copyright : Dmitry Zuikov 2020
-- License : MIT
--
-- Maintainer : dzuikov@gmail.com
-- Stability : experimental
-- Portability : unknown
--
-- The lightweight and multi-functional text tokenizer allowing different types of text tokenization
-- depending on it's settings.
--
-- It may be used in different situations, for DSLs, text markups or even for parsing simple grammars
-- easier and sometimes faster than by using mainstream parsing combinators or parser
-- generators.
--
-- The primary goal of this package is to parse unstructured text data, however it may be used for
-- parsing such data formats as CSV with ease.
--
-- Currently it supports the following types of entities: atoms, string literals (currently with the
-- minimal set of escaped characters), punctuation characters and delimeters.
--
-- == Examples
-- === Simple CSV-like tokenization
-- >>> tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq "]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq ","","","",""]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
-- >>> tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
-- >>> tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
--
-- == Notes
--
-- === About the delimiter tokens
-- This type of token appears during "delimited"
-- format processing and disappears from the results. Currently
-- you will never see it unless normalization is turned off by the 'nn' option.
--
-- The delimeters make sense in case of processing the CSV-like formats,
-- but in this case you probably need only values in results.
--
-- This behavior may be changed later. But right now delimeters seem pointless
-- in results. If you process some sort of grammar where delimeter character
-- is important, you may use punctuation instead, i.e:
--
-- >>> let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
-- >>> tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
-- ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
--
-- == Other
-- For CSV-like formats it makes sense to split the text into lines first,
-- otherwise newline characters may cause weird results
--
--
module Data.Text.Fuzzy.Tokenize ( TokenizeSpec
, IsToken(..)
, tokenize
, esc
, addEmptyFields
, emptyFields
, nn
, sq
, sqq
, noslits
, sl
, sr
, uw
, delims
, comment
, punct
, indent
, itabstops
, keywords
, eol
) where
import Prelude hiding (init)
import Control.Applicative
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import Data.Monoid()
import Data.Set (Set)
import Data.Text (Text)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import Control.Monad (when)
import Control.Monad.RWS
-- | Tokenization settings. Use mempty for an empty value
-- and construction functions for changing the settings.
--
-- | Tokenization settings. Use mempty for an empty value
-- and construction functions for changing the settings.
--
data TokenizeSpec = TokenizeSpec { tsAtoms          :: Set Text   -- ^ atom set; not consulted by the code visible in this module -- TODO confirm intended use
                                 , tsStringQQ       :: Maybe Bool -- ^ enable double-quoted string literals ('sqq')
                                 , tsStringQ        :: Maybe Bool -- ^ enable single-quoted string literals ('sq')
                                 , tsNoSlits        :: Maybe Bool -- ^ fuse string literals into text chunks ('noslits')
                                 , tsLineComment    :: Map Char Text -- ^ comment prefixes, keyed by first character ('comment')
                                 , tsDelims         :: Set Char   -- ^ field delimiter characters ('delims')
                                 , tsEol            :: Maybe Bool -- ^ emit EOL tokens ('eol')
                                 , tsStripLeft      :: Maybe Bool -- ^ strip spaces on the left of a token ('sl')
                                 , tsStripRight     :: Maybe Bool -- ^ strip spaces on the right of a token ('sr')
                                 , tsUW             :: Maybe Bool -- ^ unwords-words whitespace squeeze ('uw')
                                 , tsNotNormalize   :: Maybe Bool -- ^ skip normalization, raw char tokens ('nn')
                                 , tsEsc            :: Maybe Bool -- ^ process escapes in string literals ('esc')
                                 , tsAddEmptyFields :: Maybe Bool -- ^ emit empty-field tokens ('addEmptyFields')
                                 , tsPunct          :: Set Char   -- ^ punctuation characters ('punct')
                                 , tsIndent         :: Maybe Bool -- ^ emit indentation tokens ('indent')
                                 , tsItabStops      :: Maybe Int  -- ^ tab expansion width ('itabstops')
                                 , tsKeywords       :: Set Text   -- ^ keyword set ('keywords')
                                 }
                    deriving (Eq,Ord,Show)
-- Right-biased combination: for every optional flag the right
-- operand's setting wins when present (note the flipped '<|>'
-- arguments), and the set/map fields are unioned. This lets
-- later spec fragments override earlier ones in a '<>' chain.
instance Semigroup TokenizeSpec where
  (<>) a b = TokenizeSpec { tsAtoms          = tsAtoms b <> tsAtoms a
                          , tsStringQQ       = tsStringQQ b <|> tsStringQQ a
                          , tsStringQ        = tsStringQ b <|> tsStringQ a
                          , tsNoSlits        = tsNoSlits b <|> tsNoSlits a
                          , tsLineComment    = tsLineComment b <> tsLineComment a
                          , tsDelims         = tsDelims b <> tsDelims a
                          , tsEol            = tsEol b <|> tsEol a
                          , tsStripLeft      = tsStripLeft b <|> tsStripLeft a
                          , tsStripRight     = tsStripRight b <|> tsStripRight a
                          , tsUW             = tsUW b <|> tsUW a
                          , tsNotNormalize   = tsNotNormalize b <|> tsNotNormalize a
                          , tsEsc            = tsEsc b <|> tsEsc a
                          , tsAddEmptyFields = tsAddEmptyFields b <|> tsAddEmptyFields a
                          , tsPunct          = tsPunct b <> tsPunct a
                          , tsIndent         = tsIndent b <|> tsIndent a
                          , tsItabStops      = tsItabStops b <|> tsItabStops a
                          , tsKeywords       = tsKeywords b <> tsKeywords a
                          }
-- The neutral spec: every flag unset ('Nothing'), every
-- set and map empty — combining with it changes nothing.
instance Monoid TokenizeSpec where
  mempty = TokenizeSpec { tsAtoms          = mempty
                        , tsStringQQ       = Nothing
                        , tsStringQ        = Nothing
                        , tsNoSlits        = Nothing
                        , tsLineComment    = mempty
                        , tsDelims         = mempty
                        , tsEol            = Nothing
                        , tsStripLeft      = Nothing
                        , tsStripRight     = Nothing
                        , tsUW             = Nothing
                        , tsNotNormalize   = Nothing
                        , tsEsc            = Nothing
                        , tsAddEmptyFields = Nothing
                        , tsPunct          = mempty
                        , tsIndent         = Nothing
                        , tsItabStops      = Nothing
                        , tsKeywords       = mempty
                        }
-- | Interpret an optional boolean flag: 'Just True' means the
-- option is enabled; 'Just False' and 'Nothing' both mean it
-- is disabled. Expressed via 'fromMaybe' instead of a manual
-- pattern match — same behavior, standard idiom.
justTrue :: Maybe Bool -> Bool
justTrue = fromMaybe False
-- | Enables generation of end-of-line tokens (see 'mkEol').
eol :: TokenizeSpec
eol = mempty { tsEol = Just True }
-- | Enables escape-character processing inside string
-- literals. The currently supported escaped characters are:
-- [" ' \ t n r a b f v ]
esc :: TokenizeSpec
esc = mempty { tsEsc = Just True }
-- | Emit an empty-field token (see 'mkEmpty') whenever a
-- delimiter is reached with no tokens accumulated before it.
-- Handy when processing CSV-like data, where empty columns
-- need to be distinguished.
addEmptyFields :: TokenizeSpec
addEmptyFields = mempty { tsAddEmptyFields = Just True }
-- | An alias for 'addEmptyFields'.
emptyFields :: TokenizeSpec
emptyFields = addEmptyFields
-- | Disables token normalization, so the raw character-level
-- token stream is returned. Mostly useful for debugging.
nn :: TokenizeSpec
nn = mempty { tsNotNormalize = Just True }
-- | Enables single-quoted string literals. After an opening
-- '\'' character everything — delimiters, comment prefixes
-- and other special characters included — is treated as part
-- of the literal, up to the next unescaped single quote.
sq :: TokenizeSpec
sq = mempty { tsStringQ = Just True }
-- | Enables double-quoted string literals,
-- analogous to 'sq' for single-quoted ones.
sqq :: TokenizeSpec
sqq = mempty { tsStringQQ = Just True }
-- | Merge string literals into the surrounding text chunks
-- instead of emitting them as separate tokens.
--
-- Useful when processing delimited (CSV-like) data. Normally
-- sequential text chunks are concatenated together, but a
-- text chunk followed by a string literal produces two
-- separate tokens, which can look like an extra column:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq ) "aaa:bebe:'qq' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq",Just " aaa",Just "next",Nothing,Nothing]
--
-- With this option set, string literals and adjacent text
-- are fused, which is usually what CSV-style processing wants
-- if text chunks and literals need not be distinguished:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq<>noslits) "aaa:bebe:'qq:foo' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq:foo aaa",Just "next",Nothing,Nothing]
--
noslits :: TokenizeSpec
noslits = mempty { tsNoSlits = Just True }
-- | Sets the delimiter characters used to split the stream
-- into fields — useful for CSV-like separated formats.
-- Reporting of empty fields may be enabled with
-- 'addEmptyFields'.
delims :: String -> TokenizeSpec
delims cs = mempty { tsDelims = Set.fromList cs }
-- | Strips whitespace on the left side of each token.
-- String literals are processed as usual and are not
-- affected. Mostly useful for CSV-like formats; otherwise
-- 'delims' may be used to skip unwanted spaces.
sl :: TokenizeSpec
sl = mempty { tsStripLeft = Just True }
-- | Strips whitespace on the right side of each token.
-- String literals are processed as usual and are not
-- affected. Mostly useful for CSV-like formats; otherwise
-- 'delims' may be used to skip unwanted spaces.
sr :: TokenizeSpec
sr = mempty { tsStripRight = Just True }
-- | Squeezes whitespace inside each token: strips both ends
-- and collapses runs of spaces into one. The name comes from
-- @unwords . words@.
--
-- String literals are processed as usual and are not
-- affected. Mostly useful for CSV-like formats; otherwise
-- 'delims' may be used to skip unwanted spaces.
uw :: TokenizeSpec
uw = mempty { tsUW = Just True }
-- | Registers a line-comment prefix: everything from the
-- prefix up to the next newline character is ignored.
-- May be combined several times to support multiple
-- comment prefixes.
comment :: Text -> TokenizeSpec
comment pfx = mempty { tsLineComment = table }
  where
    -- index the prefix remainder by its first character
    -- for a quick lookup during scanning; an empty prefix
    -- registers nothing
    table = maybe mempty (uncurry Map.singleton) (Text.uncons pfx)
-- | Sets the punctuation characters. Every punctuation
-- character is reported as a token of its own, and any
-- token is broken at a punctuation character.
--
-- Useful for handling... er... punctuation, like
--
-- >> function(a,b)
--
-- or
--
-- >> (apply function 1 2 3)
--
--
-- >>> tokenize spec "(apply function 1 2 3)" :: [Text]
-- ["(","apply","function","1","2","3",")"]
--
punct :: Text -> TokenizeSpec
punct cs = mempty { tsPunct = Text.foldr Set.insert Set.empty cs }
-- | Sets the keyword list. Each keyword is reported as a
-- separate keyword token (see 'mkKeyword').
keywords :: [Text] -> TokenizeSpec
keywords ws = mempty { tsKeywords = Set.fromList ws }
-- | Enables indentation tokens (see 'mkIndent').
indent :: TokenizeSpec
indent = mempty { tsIndent = pure True }
-- | Sets the tab-expansion multiplier: each leading tab
-- expands into n spaces before processing. Indentation
-- support is turned on as well. Only tabs at the beginning
-- of a line — before the first non-space character — are
-- expanded.
itabstops :: Int -> TokenizeSpec
itabstops n = mempty { tsIndent = pure True, tsItabStops = Just n }
-- | The tokenizer monad: an RWS computation reading the
-- 'TokenizeSpec' settings and accumulating produced tokens
-- in the writer; the state component is unused (unit).
-- NOTE(review): the derivations rely on newtype deriving
-- being enabled elsewhere (pragma/cabal) — not visible here.
newtype TokenizeM w a = TokenizeM (RWS TokenizeSpec w () a)
                        deriving ( Applicative
                                 , Functor
                                 , MonadReader TokenizeSpec
                                 , MonadWriter w
                                 , MonadState ()
                                 , Monad
                                 )
-- | Internal token representation. The scanner emits the
-- character-level variants; 'normalize' folds runs of them
-- into the compound ones.
data Token = TChar Char    -- ^ a single plain character
           | TSChar Char   -- ^ a single string-literal character
           | TPunct Char   -- ^ a punctuation character
           | TText Text    -- ^ a normalized text chunk
           | TSLit Text    -- ^ a normalized string literal
           | TKeyword Text -- ^ a keyword (see 'keywords')
           | TEmpty        -- ^ an empty field
           | TDelim        -- ^ a field delimiter marker
           | TIndent Int   -- ^ indentation width of a line
           | TEol          -- ^ end of line
             deriving (Eq,Ord,Show)
-- | Typeclass for token values.
-- Note that some tokens appear in results
-- only when the 'nn' option is set: sequences
-- of characters are turned into text tokens or string
-- literals, and delimiter tokens are removed from the
-- results.
class IsToken a where
  -- | Create a character token
  mkChar :: Char -> a
  -- | Create a string literal character token
  mkSChar :: Char -> a
  -- | Create a punctuation token
  mkPunct :: Char -> a
  -- | Create a text chunk token
  mkText :: Text -> a
  -- | Create a string literal token
  mkStrLit :: Text -> a
  -- | Create a keyword token
  mkKeyword :: Text -> a
  -- | Create an empty field token
  mkEmpty :: a
  -- | Create a delimiter token (defaults to 'mkEmpty')
  mkDelim :: a
  mkDelim = mkEmpty
  -- | Creates an indent token (the width is discarded by default)
  mkIndent :: Int -> a
  mkIndent = const mkEmpty
  -- | Creates an EOL token (defaults to 'mkEmpty')
  mkEol :: a
  mkEol = mkEmpty
-- Tokens as optional text: every real token carries its
-- text in 'Just'; empty fields map to 'Nothing'.
instance IsToken (Maybe Text) where
  mkChar    = Just . Text.singleton
  mkSChar   = Just . Text.singleton
  mkPunct   = Just . Text.singleton
  mkText    = Just
  mkStrLit  = Just
  mkKeyword = Just
  mkEmpty   = Nothing
-- Tokens as plain text: empty fields become the empty text,
-- so they are indistinguishable from empty chunks.
instance IsToken Text where
  mkChar    c = Text.singleton c
  mkSChar   c = Text.singleton c
  mkPunct   c = Text.singleton c
  mkText    t = t
  mkStrLit  t = t
  mkKeyword t = t
  mkEmpty     = Text.empty
-- | Tokenize a text with the given settings, rendering each
-- internal token through the 'IsToken' dictionary of the
-- requested result type.
tokenize :: IsToken a => TokenizeSpec -> Text -> [a]
tokenize spec input = fmap render (tokenize' spec input)
  where
    -- map every internal token to its user-facing value
    render tok =
      case tok of
        TChar c    -> mkChar c
        TSChar c   -> mkSChar c
        TText t    -> mkText t
        TSLit t    -> mkStrLit t
        TKeyword t -> mkKeyword t
        TPunct c   -> mkPunct c
        TEmpty     -> mkEmpty
        TDelim     -> mkDelim
        TIndent n  -> mkIndent n
        TEol       -> mkEol
-- | Run a tokenizer computation with the given settings and
-- collect the emitted tokens, normalizing the stream unless
-- the 'nn' option disabled normalization.
execTokenizeM :: TokenizeM [Token] a -> TokenizeSpec -> [Token]
execTokenizeM (TokenizeM m) spec
  | justTrue (tsNotNormalize spec) = raw
  | otherwise                      = normalize spec raw
  where
    -- writer output of the RWS computation (result discarded)
    raw = snd (execRWS m spec ())
-- | Scanner pass: walks the input character by character and
-- produces the raw token stream (plain/string characters,
-- delimiter, punctuation, indent and EOL markers), which is
-- then glued into chunks by 'normalize' (unless 'nn' is set).
tokenize' :: TokenizeSpec -> Text -> [Token]
tokenize' spec txt = execTokenizeM (root' txt) spec
  where
    r = spec

    -- feature flags derived from the settings
    noIndent = not doIndent
    doIndent = justTrue (tsIndent r)
    eolOk    = justTrue (tsEol r)

    -- scan leading indentation (when enabled), then the line body
    root' x = scanIndent x >>= root

    -- main loop: dispatch on the next input character
    root ts = do
      case Text.uncons ts of
        Nothing -> pure ()
        -- newline while tracking indentation: emit EOL (if
        -- requested) and rescan the next line's indentation
        Just ('\n', rest) | doIndent -> raiseEol >> root' rest
        -- a delimiter character becomes a TDelim marker
        Just (c, rest) | Set.member c (tsDelims r) -> tell [TDelim] >> root rest
        -- string literal openers, when the matching quote style is on
        Just ('\'', rest) | justTrue (tsStringQ r) -> scanQ '\'' rest
        Just ('"', rest) | justTrue (tsStringQQ r) -> scanQ '"' rest
        -- a character that may start a line comment
        Just (c, rest) | Map.member c (tsLineComment r) -> scanComment (c,rest)
        -- punctuation is a token of its own
        Just (c, rest) | Set.member c (tsPunct r) -> tell [TPunct c] >> root rest
        -- anything else is a plain character
        Just (c, rest) | otherwise -> tell [TChar c] >> root rest

    -- emit an EOL token only when the 'eol' option is set
    raiseEol | eolOk = tell [TEol]
             | otherwise = pure ()

    -- column width of one indentation character
    -- (a tab defaults to 8 columns, see 'itabstops')
    expandSpace ' ' = 1
    expandSpace '\t' = (fromMaybe 8 (tsItabStops r))
    expandSpace _ = 0

    -- consume leading spaces/tabs, emitting their total width
    scanIndent x | noIndent = pure x
                 | otherwise = do
        let (ss,as) = Text.span (\c -> c == ' ' || c == '\t') x
        tell [ TIndent (sum (map expandSpace (Text.unpack ss))) ]
        pure as

    -- first character of a potential comment was seen; check
    -- whether the rest of the registered prefix follows
    scanComment (c,rest) = do
      suff <- Map.lookup c <$> asks tsLineComment
      case suff of
        -- full prefix present: skip to the end of the line
        -- (the newline itself is left for 'root' to handle)
        Just t | Text.isPrefixOf t rest -> do
          root $ Text.dropWhile ('\n' /=) rest
        -- not a comment after all: keep the character
        _ -> tell [TChar c] >> root rest

    -- inside a string literal delimited by quote character q
    scanQ q ts = do
      case Text.uncons ts of
        -- unterminated literal: simply stop scanning it
        Nothing -> root ts
        Just ('\\', rest) | justTrue (tsEsc r) -> unesc (scanQ q) rest
                          | otherwise -> tell [tsChar '\\'] >> scanQ q rest
        -- closing quote ends the literal; anything else is part of it
        Just (c, rest) | c == q -> root rest
                       | otherwise -> tell [tsChar c] >> scanQ q rest

    -- decode one escape sequence, then resume with continuation f
    unesc f ts =
      case Text.uncons ts of
        Nothing -> f ts
        Just ('"', rs) -> tell [tsChar '"' ] >> f rs
        Just ('\'', rs) -> tell [tsChar '\''] >> f rs
        Just ('\\', rs) -> tell [tsChar '\\'] >> f rs
        Just ('t', rs) -> tell [tsChar '\t'] >> f rs
        Just ('n', rs) -> tell [tsChar '\n'] >> f rs
        Just ('r', rs) -> tell [tsChar '\r'] >> f rs
        Just ('a', rs) -> tell [tsChar '\a'] >> f rs
        Just ('b', rs) -> tell [tsChar '\b'] >> f rs
        Just ('f', rs) -> tell [tsChar '\f'] >> f rs
        Just ('v', rs) -> tell [tsChar '\v'] >> f rs
        -- unrecognized escape: the escaped character is dropped
        Just (_, rs) -> f rs

    -- string-literal characters degrade to plain characters
    -- when 'noslits' is set, so they fuse with text chunks
    tsChar c | justTrue (tsNoSlits spec) = TChar c
             | otherwise = TSChar c
-- | Normalization state: number of tokens emitted since the
-- last delimiter; zero means the field is empty so far.
newtype NormStats = NormStats { nstatBeforeDelim :: Int }
-- | Normalization pass: folds runs of character tokens into
-- text chunks, string literals and keywords, merges adjacent
-- indent tokens, drops delimiter markers (optionally raising
-- empty-field tokens) and applies the configured whitespace
-- stripping to text chunks.
normalize :: TokenizeSpec -> [Token] -> [Token]
normalize spec tokens = snd $ execRWS (go tokens) () init
  where
    -- end of input counts as a field boundary too
    go [] = addEmptyField
    -- collapse consecutive indent tokens into their sum
    go s@(TIndent _ : _) = do
      let (iis, rest') = List.span isIndent s
      tell [TIndent (sum [k | TIndent k <- iis])]
      go rest'
    -- a run of plain characters becomes a text chunk, or a
    -- keyword token when it matches the keyword set
    go (TChar c0 : cs) = do
      let (n,ns) = List.span isTChar cs
      succStat
      let chunk = eatSpaces $ Text.pack (c0 : [ c | TChar c <- n])
      let kw = Set.member chunk (tsKeywords spec)
      tell [ if kw then TKeyword chunk else TText chunk ]
      go ns
    -- a run of string-literal characters becomes one literal
    go (TSChar x : xs) = do
      let (n,ns) = List.span isTSChar xs
      succStat
      tell [ TSLit $ Text.pack (x : [ c | TSChar c <- n]) ]
      go ns
    -- delimiters are dropped; an empty field may be raised
    go (TDelim : xs) = do
      addEmptyField
      pruneStat
      go xs
    go (TPunct c : xs) = do
      tell [ TPunct c ]
      succStat
      go xs
    -- every other token passes through unchanged
    go (x:xs) = tell [x] >> go xs

    -- count one more token since the last delimiter
    succStat = do
      modify (\x -> x { nstatBeforeDelim = succ (nstatBeforeDelim x)})

    -- reset the per-field token counter at a delimiter
    pruneStat = do
      modify (\x -> x { nstatBeforeDelim = 0 } )

    -- raise TEmpty when a field boundary is reached with no
    -- tokens inside it and 'addEmptyFields' is enabled
    addEmptyField = do
      ns <- gets nstatBeforeDelim
      when (ns == 0 && justTrue (tsAddEmptyFields spec) ) $ do
        tell [ TEmpty ]

    isTChar (TChar _) = True
    isTChar _ = False

    isTSChar (TSChar _) = True
    isTSChar _ = False

    isIndent (TIndent _) = True
    isIndent _ = False

    -- initial state: nothing emitted yet (shadows Prelude.init,
    -- which is hidden by the module's import list)
    init = NormStats { nstatBeforeDelim = 0 }

    -- apply the configured stripping to a text chunk; note the
    -- precedence: both-sides strip beats one-sided strip, and
    -- any strip option beats the 'uw' whitespace squeeze
    eatSpaces s | sboth = Text.strip s
                | sLonly = Text.stripStart s
                | sRonly = Text.stripEnd s
                | sWU = (Text.unwords . Text.words) s
                | otherwise = s
      where sboth = justTrue (tsStripLeft spec) && justTrue (tsStripRight spec)
            sLonly = justTrue (tsStripLeft spec) && not (justTrue (tsStripRight spec))
            sRonly = not (justTrue (tsStripLeft spec)) && justTrue (tsStripRight spec)
            sWU = justTrue (tsUW spec)