Squashed 'miscellaneous/fuzzy-parse/' content from commit a834b152e

git-subtree-dir: miscellaneous/fuzzy-parse
git-subtree-split: a834b152e29d632c816eefe117036e5d9330bd03
This commit is contained in:
voidlizard 2024-10-07 05:06:03 +03:00
commit cf85d2df2a
26 changed files with 1854 additions and 0 deletions

5
.envrc Normal file

@ -0,0 +1,5 @@
if [ -f .envrc.local ]; then
source_env .envrc.local
fi
use flake

6
.gitignore vendored Normal file

@ -0,0 +1,6 @@
*.swp
dist-newstyle/
Setup.hs
.direnv
.hbs2-git/

10
CHANGELOG.md Normal file

@ -0,0 +1,10 @@
# Revision history for fuzzy-parse
## 0.1.2.0
- Technical release
- Added some missing pieces
## 0.1.0.0 -- YYYY-mm-dd
* First version. Released on an unsuspecting world.

20
LICENSE Normal file

@ -0,0 +1,20 @@
Copyright (c) 2019 Dmitry Zuikov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

131
README.markdown Normal file

@ -0,0 +1,131 @@
# About
# Data.Text.Fuzzy.Tokenize
The lightweight and multi-functional text tokenizer, allowing different types of text tokenization
depending on its settings.
It may be used in different situations: for DSLs, text markup or even for parsing simple grammars,
more easily and sometimes faster than with mainstream parser combinators or parser
generators.
The primary goal of this package is to parse unstructured text data, however it may be used for
parsing such data formats as CSV with ease.
Currently it supports the following types of entities: atoms, string literals (currently with a
minimal set of escaped characters), punctuation characters and delimiters.
## Examples
### Simple CSV-like tokenization
```haskell
tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
["aaa "," bebeb "," qqq "]
```
```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
["aaa "," bebeb "," qqq ","","","",""]
```
```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]
[Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```
```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
[Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```
```haskell
let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
[Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
```
```haskell
let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
[Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
```
### Primitive lisp-like language
```haskell
{-# LANGUAGE QuasiQuotes, ExtendedDefaultRules #-}
import Text.InterpolatedString.Perl6 (q)
import Data.Text.Fuzzy.Tokenize
data TTok = TChar Char
| TSChar Char
| TPunct Char
| TText Text
| TStrLit Text
| TKeyword Text
| TEmpty
deriving(Eq,Ord,Show)
instance IsToken TTok where
mkChar = TChar
mkSChar = TSChar
mkPunct = TPunct
mkText = TText
mkStrLit = TStrLit
mkKeyword = TKeyword
mkEmpty = TEmpty
main = do
let spec = delims " \n\t" <> comment ";"
<> punct "{}()[]<>"
<> sq <> sqq
<> uw
<> keywords ["define","apply","+"]
let code = [q|
(define add (a b ) ; define simple function
(+ a b) )
(define r (add 10 20))
|]
let toks = tokenize spec code :: [TTok]
print toks
```
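Running this prints a token list along the lines of the following (the same expectation appears in the test suite):
```haskell
[ TPunct '(', TKeyword "define", TText "add", TPunct '(', TText "a", TText "b", TPunct ')'
, TPunct '(', TKeyword "+", TText "a", TText "b", TPunct ')', TPunct ')'
, TPunct '(', TKeyword "define", TText "r"
, TPunct '(', TText "add", TText "10", TText "20", TPunct ')', TPunct ')'
]
```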
## Notes
### About the delimiter tokens
Delimiter tokens appear while processing "delimited" formats and disappear from the results.
Currently you will never see them unless normalization is turned off with the 'nn' option.
Delimiters make sense when processing CSV-like formats, but in that case you probably
need only the values in the results.
This behavior may change later. But right now delimiter tokens seem pointless in the results. If you
process some sort of grammar where the delimiter character is important, you may use punctuation
instead, e.g.:
```haskell
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
```
### Other
For CSV-like formats it makes sense to split the text into lines first, otherwise newline characters
may lead to weird results.
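For example, a minimal sketch of per-line tokenization (assuming the whole input is already available as `csvText :: Text`):
```haskell
import qualified Data.Text as Text

-- tokenize each line on its own so newline characters never end up inside fields
rows = map (tokenize (delims ":" <> emptyFields)) (Text.lines csvText) :: [[Maybe Text]]
```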
# Authors
This library is written and maintained by Dmitry Zuikov, dzuikov@gmail.com

12
TODO Normal file

@ -0,0 +1,12 @@
- [ ] TODO: Tests for Data.Text.Fuzzy.Section
- [ ] TODO: haddock for Data.Text.Fuzzy.Section
- [~] TODO: Tests
- [+] TODO: Freeze dependencies versions
- [ ] TODO: Version number
- [+] TODO: Haddocks
- [ ] TODO: Tokenizer: Indentation support
- [ ] TODO: Tokenizer: Block comments support

6
cabal.project Normal file

@ -0,0 +1,6 @@
packages: *.cabal
allow-newer: all
tests: True

61
flake.lock Normal file

@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"locked": {
"lastModified": 1644229661,
"narHash": "sha256-1YdnJAsNy69bpcjuoKdOYQX0YxZBiCYZo4Twxerqv7k=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "3cecb5b042f7f209c56ffd8371b2711a290ec797",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"haskell-flake-utils": {
"inputs": {
"flake-utils": "flake-utils"
},
"locked": {
"lastModified": 1707809372,
"narHash": "sha256-wfTL9PlCSOqSSyU4eenFFI7pHrV21gba4GEILnI4nAU=",
"owner": "ivanovs-4",
"repo": "haskell-flake-utils",
"rev": "3cbdc5d6093e8b4464ae64097e0c8c61e4414ff2",
"type": "github"
},
"original": {
"owner": "ivanovs-4",
"repo": "haskell-flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1727089097,
"narHash": "sha256-ZMHMThPsthhUREwDebXw7GX45bJnBCVbfnH1g5iuSPc=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "568bfef547c14ca438c56a0bece08b8bb2b71a9c",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"haskell-flake-utils": "haskell-flake-utils",
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

69
flake.nix Normal file

@ -0,0 +1,69 @@
{
description = "Haskell cabal package";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
haskell-flake-utils.url = "github:ivanovs-4/haskell-flake-utils";
# another-simple-haskell-flake.url = "something";
# some-cabal-pkg.url = "github:example/some-cabal-pkg";
# some-cabal-pkg.flake = false;
};
outputs = { self, nixpkgs, haskell-flake-utils, ... }@inputs:
haskell-flake-utils.lib.simpleCabal2flake {
inherit self nixpkgs;
# systems = [ "x86_64-linux" ];
# DON'T FORGET TO PUT YOUR PACKAGE NAME HERE, REMOVING `throw`
name = "fuzzy-parse";
shellExtBuildInputs = {pkgs}: with pkgs; [
haskellPackages.haskell-language-server
];
# Whether to build Hoogle in the default shell
shellWithHoogle = true;
## Optional parameters follow
# nixpkgs config
# config = { };
# Add another haskell flakes as requirements
# haskellFlakes = [ inputs.another-simple-haskell-flake ];
# Use this to load other flakes overlays to supplement nixpkgs
# preOverlays = [ ];
# Pass either a function or a file
# preOverlay = ./overlay.nix;
# Override haskell packages
# hpPreOverrides = { pkgs }: new: old:
# with pkgs.haskell.lib; with haskell-flake-utils.lib;
# tunePackages pkgs old {
# some-haskellPackages-package = [ dontHaddock ];
# } // {
# some-cabal-pkg = ((jailbreakUnbreak pkgs) (dontCheck (old.callCabal2nix "some-cabal-pkg" inputs.some-cabal-pkg {})));
# };
# Arguments for callCabal2nix
# cabal2nixArgs = {pkgs}: {
# };
# Maps to the devShell output. Pass in a shell.nix file or function
# shell = ./shell.nix
# Additional build intputs of the default shell
# shellExtBuildInputs = {pkgs}: with pkgs; [
# haskellPackages.haskell-language-server
# ];
# Whether to build Hoogle in the default shell
# shellWithHoogle = true;
};
}

160
fuzzy-parse.cabal Normal file

@ -0,0 +1,160 @@
cabal-version: 3.0
name: fuzzy-parse
version: 0.1.3.1
synopsis: Tools for processing unstructured text data
description:
Lightweight and easy-to-use functions for text tokenizing and parsing, aimed at
parsing mostly unstructured data, though structured formats may be parsed as well.
It may be used in different situations: for DSLs, text markup or even for parsing simple
grammars, more easily and sometimes faster than with mainstream parser combinators
or parser generators.
See the README.markdown, examples and modules documentation for more.
license: MIT
license-file: LICENSE
author: Dmitry Zuikov
maintainer: dzuikov@gmail.com
category: Text, Parsing
extra-source-files: CHANGELOG.md
homepage: https://github.com/hexresearch/fuzzy-parse
bug-reports: https://github.com/hexresearch/fuzzy-parse/issues
extra-source-files:
README.markdown
common shared-properties
default-language: GHC2021
default-extensions:
ApplicativeDo
, BangPatterns
, BlockArguments
, ConstraintKinds
, DataKinds
, DeriveDataTypeable
, DeriveGeneric
, DerivingStrategies
, DerivingVia
, ExtendedDefaultRules
, FlexibleContexts
, FlexibleInstances
, GADTs
, GeneralizedNewtypeDeriving
, ImportQualifiedPost
, LambdaCase
, MultiParamTypeClasses
, OverloadedStrings
, QuasiQuotes
, RecordWildCards
, ScopedTypeVariables
, StandaloneDeriving
, TemplateHaskell
, TupleSections
, TypeApplications
, TypeFamilies
library
import: shared-properties
ghc-options:
-Wall
-fno-warn-type-defaults
-O2
"-with-rtsopts=-N4 -A64m -AL256m -I0"
exposed-modules: Data.Text.Fuzzy.Tokenize
, Data.Text.Fuzzy.Dates
, Data.Text.Fuzzy.Section
, Data.Text.Fuzzy.SExp
, Data.Text.Fuzzy.Attoparsec.Day
, Data.Text.Fuzzy.Attoparsec.Month
build-depends: base
, attoparsec
, containers
, mtl
, prettyprinter
, safe
, streaming
, scientific
, text
, time
, microlens-platform
, uniplate
, unliftio
, unordered-containers
, timeit
hs-source-dirs: src
executable fuzzy-sexp-parse
import: shared-properties
default-language: GHC2021
ghc-options:
-Wall
-fno-warn-type-defaults
-O2
main-is: FuzzySexpParse.hs
hs-source-dirs: misc
build-depends: base, fuzzy-parse
, containers
, hspec
, hspec-discover
, interpolatedstring-perl6
, text
, mtl
, streaming
, transformers
, exceptions
, uniplate
, microlens-platform
, safe
, timeit
, prettyprinter
test-suite fuzzy-parse-test
import: shared-properties
default-language: GHC2021
type: exitcode-stdio-1.0
main-is: Spec.hs
other-modules: FuzzyParseSpec
hs-source-dirs: test
build-depends: base, fuzzy-parse
, containers
, hspec
, hspec-discover
, interpolatedstring-perl6
, text
, mtl
, streaming
, transformers
, exceptions
, uniplate
, microlens-platform
, safe
, timeit
build-tool-depends: hspec-discover:hspec-discover == 2.*

30
misc/FuzzySexpParse.hs Normal file

@ -0,0 +1,30 @@
module Main where
import Data.Text.Fuzzy.SExp
import Data.Text.IO qualified as IO
import Data.Text qualified as Text
import Data.Either
import System.TimeIt
import Control.Monad.Except
import Data.Functor
import Data.Function
import Data.Fixed
import Prettyprinter
import System.IO
main :: IO ()
main = do
s <- IO.getContents
(tt,toks) <- timeItT do
pure (tokenizeSexp s)
(pt,top) <- timeItT do
runExceptT (parseTop @() s) <&> either (error.show) id
print (vcat (fmap pretty top))
hPrint stderr $ pretty (Text.length s) <+> "chars, parsed in" <+> viaShow (realToFrac pt :: Fixed E6)

0
nix/derivations/.gitignore vendored Normal file

7
nix/pkgs.json Normal file

@ -0,0 +1,7 @@
{
"url": "https://github.com/NixOS/nixpkgs.git",
"rev": "5cf0de2485efeccc307692eedadbb2d9bfdc7013",
"date": "2020-06-04T17:30:37+08:00",
"sha256": "07axrr50nlmnvba5ja2ihzjwczi66znak57bhcz472w22w7m3sd1",
"fetchSubmodules": false
}

5
nix/pkgs.nix Normal file

@ -0,0 +1,5 @@
import ((import <nixpkgs> {}).fetchFromGitHub {
owner = "NixOS";
repo = "nixpkgs";
inherit (builtins.fromJSON (builtins.readFile ./pkgs.json)) rev sha256;
})

29
nix/release.nix Normal file

@ -0,0 +1,29 @@
let
pkgs = import ./pkgs.nix { inherit config;
};
lib = pkgs.haskell.lib;
config = {
packageOverrides = pkgs: rec {
haskellPackages = pkgs.haskellPackages.override { overrides = haskOverrides; };
};
};
gitignore = pkgs.callPackage (pkgs.fetchFromGitHub {
owner = "siers";
repo = "nix-gitignore";
rev = "ce0778ddd8b1f5f92d26480c21706b51b1af9166";
sha256 = "1d7ab78i2k13lffskb23x8b5h24x7wkdmpvmria1v3wb9pcpkg2w";
}) {};
ignore = gitignore.gitignoreSourceAux ''
.stack-work
dist
dist-newstyle
.ghc.environment*
'';
haskOverrides = new: old:
let overrides = lib.packagesFromDirectory { directory = ./derivations; } new old;
in overrides;
in rec {
inherit pkgs;
packages = { inherit (pkgs.haskellPackages) fuzzy-parse;
};
}

3
nix/update-nixpkgs.sh Executable file

@ -0,0 +1,3 @@
#!/usr/bin/env nix-shell
#! nix-shell -i bash -p nix-prefetch-git
nix-prefetch-git https://github.com/NixOS/nixpkgs.git | tee pkgs.json

10
scripts/upload-docs.sh Executable file

@ -0,0 +1,10 @@
#!/bin/sh
set -e
dir=$(mktemp -d dist-docs.XXXXXX)
trap 'rm -r "$dir"' EXIT
# assumes cabal 2.4 or later
cabal v2-haddock --builddir="$dir" --haddock-for-hackage --enable-doc
cabal upload -d --publish $dir/*-docs.tar.gz

97
src/Data/Text/Fuzzy/Attoparsec/Day.hs Normal file

@ -0,0 +1,97 @@
module Data.Text.Fuzzy.Attoparsec.Day ( dayDMY
, dayYMD
, dayYYYYMMDD
, dayDMonY
, day
) where
import Data.List (zipWith)
import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser,decimal,digit,count,satisfy,inClass,skipWhile)
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
import qualified Data.Char as Char
import qualified Data.Text as Text
day :: Parser Day
day = dayDMonY <|> dayYYYYMMDD <|> dayYMD <|> dayDMY
skipDelim :: Parser ()
skipDelim = skipWhile (inClass " ./-")
dayDMY :: Parser Day
dayDMY = do
d <- decimal :: Parser Int
skipDelim
m <- decimal :: Parser Int
skipDelim
y' <- decimal :: Parser Integer
maybe (fail "bad date format") pure (makeDay y' m d)
dayYMD :: Parser Day
dayYMD = do
y' <- decimal :: Parser Integer
skipDelim
m <- decimal :: Parser Int
skipDelim
d <- decimal :: Parser Int
maybe (fail "bad date format") pure (makeDay y' m d)
dayYYYYMMDD :: Parser Day
dayYYYYMMDD = do
y <- fromIntegral . num n4 . map o <$> count 4 digit
m <- num n2 . map o <$> count 2 digit
d <- num n2 . map o <$> count 2 digit
maybe (fail "bad date format") pure (makeDay y m d)
where n4 = [1000,100,10,1]
n2 = [10,1]
o x = Char.ord x - Char.ord '0'
num n x = sum $ zipWith (*) x n
dayDMonY :: Parser Day
dayDMonY = do
d <- decimal :: Parser Int
skipDelim
m <- pMon
skipDelim
y <- decimal :: Parser Integer
maybe (fail "bad date format") pure (makeDay y m d)
where
pMon :: Parser Int
pMon = do
txt <- Text.toUpper . Text.pack <$> count 3 (satisfy Char.isLetter)
case txt of
"JAN" -> pure 1
"FEB" -> pure 2
"MAR" -> pure 3
"APR" -> pure 4
"MAY" -> pure 5
"JUN" -> pure 6
"JUL" -> pure 7
"AUG" -> pure 8
"SEP" -> pure 9
"OCT" -> pure 10
"NOV" -> pure 11
"DEC" -> pure 12
_ -> fail "bad month name"
makeYear :: Integer -> Maybe Integer
makeYear y' = if y < 1900 && y' < 99
then Nothing
else pure y
where
y = if y' < 50
then y' + 2000
else (if y' >= 50 && y' <= 99
then y' + 1900
else y' )
makeDay :: Integer -> Int -> Int -> Maybe Day
makeDay y m d | m <= 12 && m > 0 =
makeYear y >>= \yyyy -> if d <= gregorianMonthLength yyyy m
then pure $ fromGregorian yyyy m d
else Nothing
| otherwise = Nothing

46
src/Data/Text/Fuzzy/Attoparsec/Month.hs Normal file

@ -0,0 +1,46 @@
module Data.Text.Fuzzy.Attoparsec.Month ( fuzzyMonth, fuzzyMonthFromText
) where
import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser,decimal,digit,letter,many1,parseOnly)
import Data.Map (Map)
import Data.Maybe
import Data.Text (Text)
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
import qualified Data.Char as Char
import qualified Data.Map as Map
import qualified Data.Text as Text
fuzzyMonth :: Parser Int
fuzzyMonth = pMonthNum <|> pMonth
fuzzyMonthFromText :: Text -> Maybe Int
fuzzyMonthFromText = either (const Nothing) Just . parseOnly fuzzyMonth
pMonthNum :: Parser Int
pMonthNum = do
n <- decimal
if n >= 1 && n <= 12
then pure n
else fail "invalid month number"
pMonth :: Parser Int
pMonth = do
mo <- many1 (Char.toLower <$> letter)
maybe (fail "invalid month name") pure (Map.lookup mo months)
where
months :: Map String Int
months = Map.fromList [ ("jan", 1), ("january" , 1)
, ("feb", 2), ("febuary" , 2)
, ("mar", 3), ("march" , 3)
, ("apr", 4), ("april" , 4)
, ("may", 5), ("may" , 5)
, ("jun", 6), ("june" , 6)
, ("jul", 7), ("july" , 7)
, ("aug", 8), ("august" , 8)
, ("sep", 9), ("september", 9)
, ("oct", 10), ("october" , 10)
, ("nov", 11), ("november" , 11)
, ("dec", 12), ("december" , 12)
]

46
src/Data/Text/Fuzzy/Dates.hs Normal file

@ -0,0 +1,46 @@
-- |
-- Module : Data.Text.Fuzzy.Dates
-- Copyright : Dmitry Zuikov 2020
-- License : MIT
--
-- Maintainer : dzuikov@gmail.com
-- Stability : experimental
-- Portability : unknown
--
-- Fuzzy date parsing.
-- Supports a number of date formats and tries to recover
-- incomplete dates from text, with the use of some
-- reasonable assumptions. Does not support locales,
-- i.e. it assumes English month names only for now.
--
-- == Examples
--
-- > parseMaybeDay "01.01.1979"
-- > Just 1979-01-01
-- > parseMaybeDay "01.01.01"
-- > Just 2001-01-01
-- > parseMaybeDay "13/01/2019"
-- > Just 2019-01-13
-- > parseMaybeDay "2019-12-1"
-- > Just 2019-12-01
-- > parseMaybeDay "21-feb-79"
-- > Just 1979-02-21
-- > parseMaybeDay "21-feb-01"
-- > Just 2001-02-21
-- > parseMaybeDay "29feb04"
-- > Just 2004-02-29
-- > parseMaybeDay "21feb28"
-- > Just 2028-02-21
module Data.Text.Fuzzy.Dates where
import Data.Attoparsec.Text (parseOnly)
import Data.Either (either)
import Data.Text.Fuzzy.Attoparsec.Day
import Data.Text (Text)
import Data.Time.Calendar
-- | Tries to parse a date from the text.
parseMaybeDay :: Text -> Maybe Day
parseMaybeDay s = either (const Nothing) pure (parseOnly day s)

378
src/Data/Text/Fuzzy/SExp.hs Normal file

@ -0,0 +1,378 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ExtendedDefaultRules #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ImportQualifiedPost #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE TemplateHaskell #-}
module Data.Text.Fuzzy.SExp where
import Data.Text (Text)
import Control.Applicative
import Control.Monad
import Data.Function
import Data.Functor
import Data.Text.Fuzzy.Tokenize
import Control.Monad.Reader
import Data.Typeable
import Control.Monad.Except
import Control.Monad.RWS
import Data.Maybe
import Data.Char (isSpace,digitToInt)
import Data.Generics.Uniplate.Data()
import Safe
import Data.Data
import GHC.Generics
import Lens.Micro.Platform
import Data.Text qualified as Text
import Data.Coerce
import Data.Scientific
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HM
import Prettyprinter hiding (braces,list)
import Streaming.Prelude qualified as S
data TTok = TChar Char
| TSChar Char
| TPunct Char
| TText Text
| TStrLit Text
| TKeyword Text
| TEmpty
| TIndent Int
deriving stock (Eq,Ord,Show,Data,Generic)
instance IsToken TTok where
mkChar = TChar
mkSChar = TSChar
mkPunct = TPunct
mkText = TText
mkStrLit = TStrLit
mkKeyword = TKeyword
mkEmpty = TEmpty
mkIndent = TIndent
newtype C0 = C0 (Maybe Int)
deriving stock (Eq,Ord,Show,Data,Typeable,Generic)
data SExpParseError =
ParensOver C0
| ParensUnder C0
| ParensUnmatched C0
| SyntaxError C0
deriving stock (Show,Typeable)
data NumType =
NumInteger Integer
| NumDouble Scientific
deriving stock (Eq,Ord,Show,Data,Generic)
class Monoid c => ForMicroSexp c where
instance Monoid C0 where
mempty = C0 Nothing
instance Semigroup C0 where
(<>) (C0 a) (C0 b) = C0 (b <|> a)
instance ForMicroSexp C0 where
instance ForMicroSexp () where
data MicroSexp c =
List_ c [MicroSexp c]
| Symbol_ c Text
| String_ c Text
| Number_ c NumType
| Boolean_ c Bool
deriving stock (Show,Data,Generic)
pattern List :: ForMicroSexp c => [MicroSexp c] -> MicroSexp c
pattern List xs <- List_ _ xs where
List xs = List_ mempty xs
pattern Symbol :: ForMicroSexp c => Text -> MicroSexp c
pattern Symbol xs <- Symbol_ _ xs where
Symbol xs = Symbol_ mempty xs
pattern String :: ForMicroSexp c => Text -> MicroSexp c
pattern String x <- String_ _ x where
String x = String_ mempty x
pattern Number :: ForMicroSexp c => NumType -> MicroSexp c
pattern Number n <- Number_ _ n where
Number n = Number_ mempty n
pattern Boolean :: ForMicroSexp c => Bool -> MicroSexp c
pattern Boolean b <- Boolean_ _ b where
Boolean b = Boolean_ mempty b
{-# COMPLETE List, Symbol, String, Number, Boolean #-}
contextOf :: Lens (MicroSexp c) (MicroSexp c) c c
contextOf = lens g s
where
s sexp c = case sexp of
List_ _ a -> List_ c a
Symbol_ _ a -> Symbol_ c a
String_ _ a -> String_ c a
Number_ _ a -> Number_ c a
Boolean_ _ a -> Boolean_ c a
g = \case
List_ c _ -> c
Symbol_ c _ -> c
String_ c _ -> c
Number_ c _ -> c
Boolean_ c _ -> c
nil :: forall c . ForMicroSexp c => MicroSexp c
nil = List []
symbol :: forall c . ForMicroSexp c => Text -> MicroSexp c
symbol = Symbol
str :: forall c . ForMicroSexp c => Text -> MicroSexp c
str = String
newtype SExpEnv =
SExpEnv
{ sexpTranslate :: Bool
}
data SExpState =
SExpState
{ _sexpLno :: Int
, _sexpBraces :: [Char]
}
makeLenses 'SExpState
defEnv :: SExpEnv
defEnv = SExpEnv True
newtype SExpM m a = SExpM { fromSexpM :: RWST SExpEnv () SExpState m a }
deriving newtype
( Applicative
, Functor
, Monad
, MonadState SExpState
, MonadReader SExpEnv
, MonadTrans
)
instance MonadError SExpParseError m => MonadError SExpParseError (SExpM m) where
throwError = lift . throwError
catchError w = catchError (coerce $ fromSexpM w)
tokenizeSexp :: Text -> [TTok]
tokenizeSexp txt = do
let spec = delims " \r\t" <> comment ";"
<> punct "'{}()[]\n"
<> sqq
<> uw
tokenize spec txt
runSexpM :: Monad m => SExpM m a -> m a
runSexpM f = evalRWST (fromSexpM f) defEnv (SExpState 0 []) <&> fst
parseSexp :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m (MicroSexp c)
parseSexp txt = do
(s, _) <- runSexpM do
(s,rest) <- sexp (tokenizeSexp txt)
checkBraces
pure (s,rest)
pure s
checkBraces :: (MonadError SExpParseError m) => SExpM m ()
checkBraces = do
braces <- gets (view sexpBraces)
unless (null braces) $ raiseWith ParensUnder
succLno :: (MonadError SExpParseError m) => SExpM m ()
succLno = modify (over sexpLno succ)
parseTop :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m [MicroSexp c]
parseTop txt = do
let tokens = tokenizeSexp txt
S.toList_ $ runSexpM do
flip fix (mempty,tokens) $ \next -> \case
(acc, []) -> do
emit acc
(acc, TPunct '\n' : rest) -> do
succLno
emit acc
next (mempty,rest)
(acc, rest) -> do
(s, xs) <- sexp rest
next (acc <> [s],xs)
where
emit [] = pure ()
emit wtf = case wtf of
[List one] -> lift $ S.yield (List one)
xs -> lift $ S.yield (List xs)
sexp :: (ForMicroSexp c, MonadError SExpParseError m) => [TTok] -> SExpM m (MicroSexp c, [TTok])
sexp s = case s of
[] -> do
checkBraces
pure (nil, mempty)
(TText l : w) -> (,w) <$> trNum (Symbol l)
(TStrLit l : w) -> pure (String l, w)
-- so far ignored
(TPunct '\'' : rest) -> sexp rest
(TPunct '\n' : rest) -> succLno >> sexp rest
(TPunct c : rest) | isSpace c -> sexp rest
(TPunct c : rest) | isBrace c ->
maybe (pure (nil, rest)) (`list` rest) (closing c)
| otherwise -> do
raiseWith ParensOver
( _ : _ ) -> raiseWith SyntaxError
where
setContext w = do
co <- getC0
pure $ over _2 (set contextOf co) w
isBrace :: Char -> Bool
isBrace c = HM.member c braces
closing :: Char -> Maybe Char
closing c = HM.lookup c braces
braces :: HashMap Char Char
braces = HM.fromList[ ('{', '}')
, ('(', ')')
, ('[', ']')
, ('<', '>')
]
cBraces :: [Char]
cBraces = HM.elems braces
trNum tok = do
trans <- asks sexpTranslate
case tok of
Symbol s | trans -> do
let s0 = Text.unpack s
let what = Number . NumInteger <$> readMay @Integer s0
<|>
Number . NumInteger <$> parseBinary s0
<|>
Number . NumDouble <$> readMay @Scientific s0
<|>
( case s of
"#t" -> Just (Boolean True)
"#f" -> Just (Boolean False)
_ -> Nothing
)
pure $ fromMaybe (Symbol s) what
x -> pure x
{-# INLINE trNum #-}
list :: (ForMicroSexp c, MonadError SExpParseError m)
=> Char
-> [TTok]
-> SExpM m (MicroSexp c, [TTok])
list _ [] = raiseWith ParensUnder
list cb tokens = do
modify $ over sexpBraces (cb:)
go cb mempty tokens
where
isClosingFor :: Char -> Bool
isClosingFor c = c `elem` cBraces
go _ _ [] = do
checkBraces
pure (List mempty, mempty)
go cl acc (TPunct c : rest) | isSpace c = do
go cl acc rest
go cl acc (TPunct c : rest)
| isClosingFor c && c == cl = do
modify $ over sexpBraces (drop 1)
pure (List (reverse acc), rest)
| isClosingFor c && c /= cl = do
raiseWith ParensUnmatched
-- throwError =<< ParensUnmatched <$> undefined
go cl acc rest = do
(e,r) <- sexp rest
go cl (e : acc) r
getC0 :: Monad m => SExpM m C0
getC0 = do
lno <- gets (view sexpLno)
pure (C0 (Just lno))
raiseWith :: (MonadError SExpParseError m)
=> (C0 -> SExpParseError) -> SExpM m b
raiseWith a = throwError =<< a <$> getC0
instance Pretty NumType where
pretty = \case
NumInteger n -> pretty n
NumDouble n -> viaShow n
instance ForMicroSexp c => Pretty (MicroSexp c) where
pretty = \case
List xs -> parens (hsep (fmap pretty xs))
String s -> dquotes (pretty s)
Symbol s -> pretty s
Number n -> pretty n
Boolean True -> pretty "#t"
Boolean False -> pretty "#f"
isBinaryDigit :: Char -> Bool
isBinaryDigit c = c == '0' || c == '1'
parseBinary :: String -> Maybe Integer
parseBinary str =
let
withoutPrefix = case str of
'0':'b':rest -> Just rest
'0':'B':rest -> Just rest
_ -> Nothing
in if isJust withoutPrefix && all isBinaryDigit (fromJust withoutPrefix)
then Just $ foldl (\acc x -> acc * 2 + toInteger (digitToInt x)) 0 (fromJust withoutPrefix)
else Nothing

15
src/Data/Text/Fuzzy/Section.hs Normal file

@ -0,0 +1,15 @@
module Data.Text.Fuzzy.Section (cutSectionBy, cutSectionOn) where
import Data.Text (Text)
import qualified Data.List as List
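-- | Cuts out the block of lines starting at the first line equal to the first
--   marker and ending just before the first line equal to the second marker.
--   An illustrative sketch (the markers here are hypothetical):
--
-- >>> cutSectionOn "[begin]" "[end]" ["x", "[begin]", "a", "b", "[end]", "y"]
-- ["[begin]","a","b"]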
cutSectionOn :: Text -> Text -> [Text] -> [Text]
cutSectionOn a b txt = cutSectionBy (== a) (== b) txt
cutSectionBy :: (Text -> Bool) -> (Text -> Bool) -> [Text] -> [Text]
cutSectionBy a b txt = cutI
where
cutC = List.dropWhile (not . a) txt
cutI = List.takeWhile (not . b) cutC

556
src/Data/Text/Fuzzy/Tokenize.hs Normal file

@ -0,0 +1,556 @@
-- |
-- Module : Data.Text.Fuzzy.Tokenize
-- Copyright : Dmitry Zuikov 2020
-- License : MIT
--
-- Maintainer : dzuikov@gmail.com
-- Stability : experimental
-- Portability : unknown
--
-- The lightweight and multi-functional text tokenizer, allowing different types of text tokenization
-- depending on its settings.
--
-- It may be used in different situations: for DSLs, text markup or even for parsing simple grammars,
-- more easily and sometimes faster than with mainstream parser combinators or parser
-- generators.
--
-- The primary goal of this package is to parse unstructured text data, however it may be used for
-- parsing such data formats as CSV with ease.
--
-- Currently it supports the following types of entities: atoms, string literals (currently with a
-- minimal set of escaped characters), punctuation characters and delimiters.
--
-- == Examples
-- === Simple CSV-like tokenization
-- >>> tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq "]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq ","","","",""]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
-- >>> tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
-- >>> tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
--
-- == Notes
--
-- === About the delimiter tokens
-- Delimiter tokens appear while processing "delimited"
-- formats and disappear from the results. Currently
-- you will never see them unless normalization is turned off with the 'nn' option.
--
-- Delimiters make sense when processing CSV-like formats,
-- but in that case you probably need only the values in the results.
--
-- This behavior may change later. But right now delimiter tokens seem pointless
-- in the results. If you process some sort of grammar where the delimiter character
-- is important, you may use punctuation instead, e.g.:
--
-- >>> let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
-- >>> tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
-- ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
--
-- == Other
-- For CSV-like formats it makes sense to split the text into lines first,
-- otherwise newline characters may lead to weird results.
--
--
module Data.Text.Fuzzy.Tokenize ( TokenizeSpec
, IsToken(..)
, tokenize
, esc
, addEmptyFields
, emptyFields
, nn
, sq
, sqq
, noslits
, sl
, sr
, uw
, delims
, comment
, punct
, indent
, itabstops
, keywords
, eol
) where
import Prelude hiding (init)
import Control.Applicative
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import Data.Monoid()
import Data.Set (Set)
import Data.Text (Text)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import Control.Monad (when)
import Control.Monad.RWS
-- | Tokenization settings. Use mempty for an empty value
-- and construction functions for changing the settings.
--
data TokenizeSpec = TokenizeSpec { tsAtoms :: Set Text
, tsStringQQ :: Maybe Bool
, tsStringQ :: Maybe Bool
, tsNoSlits :: Maybe Bool
, tsLineComment :: Map Char Text
, tsDelims :: Set Char
, tsEol :: Maybe Bool
, tsStripLeft :: Maybe Bool
, tsStripRight :: Maybe Bool
, tsUW :: Maybe Bool
, tsNotNormalize :: Maybe Bool
, tsEsc :: Maybe Bool
, tsAddEmptyFields :: Maybe Bool
, tsPunct :: Set Char
, tsIndent :: Maybe Bool
, tsItabStops :: Maybe Int
, tsKeywords :: Set Text
}
deriving (Eq,Ord,Show)
instance Semigroup TokenizeSpec where
(<>) a b = TokenizeSpec { tsAtoms = tsAtoms b <> tsAtoms a
, tsStringQQ = tsStringQQ b <|> tsStringQQ a
, tsStringQ = tsStringQ b <|> tsStringQ a
, tsNoSlits = tsNoSlits b <|> tsNoSlits a
, tsLineComment = tsLineComment b <> tsLineComment a
, tsDelims = tsDelims b <> tsDelims a
, tsEol = tsEol b <|> tsEol a
, tsStripLeft = tsStripLeft b <|> tsStripLeft a
, tsStripRight = tsStripRight b <|> tsStripRight a
, tsUW = tsUW b <|> tsUW a
, tsNotNormalize = tsNotNormalize b <|> tsNotNormalize a
, tsEsc = tsEsc b <|> tsEsc a
, tsAddEmptyFields = tsAddEmptyFields b <|> tsAddEmptyFields a
, tsPunct = tsPunct b <> tsPunct a
, tsIndent = tsIndent b <|> tsIndent a
, tsItabStops = tsItabStops b <|> tsItabStops a
, tsKeywords = tsKeywords b <> tsKeywords a
}
instance Monoid TokenizeSpec where
mempty = TokenizeSpec { tsAtoms = mempty
, tsStringQQ = Nothing
, tsStringQ = Nothing
, tsNoSlits = Nothing
, tsLineComment = mempty
, tsDelims = mempty
, tsEol = Nothing
, tsStripLeft = Nothing
, tsStripRight = Nothing
, tsUW = Nothing
, tsNotNormalize = Nothing
, tsEsc = Nothing
, tsAddEmptyFields = Nothing
, tsPunct = mempty
, tsIndent = Nothing
, tsItabStops = Nothing
, tsKeywords = mempty
}
justTrue :: Maybe Bool -> Bool
justTrue (Just True) = True
justTrue _ = False
-- | Turns on EOL token generation
eol :: TokenizeSpec
eol = mempty { tsEol = pure True }
-- | Turn on character escaping inside string literals.
-- Currently the following escaped characters are
-- supported: [" ' \ t n r a b f v ]
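--
-- An illustrative sketch, escaping a single quote inside a single-quoted literal:
--
-- >>> tokenize (sq <> esc) "'it\\'s'" :: [Text]
-- ["it's"]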
esc :: TokenizeSpec
esc = mempty { tsEsc = pure True }
-- | Raise empty field tokens (see the 'mkEmpty' method)
-- when no tokens are found before a delimiter.
-- Useful for processing CSV-like data in
-- order to distinguish empty columns.
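--
-- An illustrative sketch: empty columns show up as empty-field tokens.
--
-- >>> tokenize (delims ":" <> addEmptyFields) "a::b:" :: [Maybe Text]
-- [Just "a",Nothing,Just "b",Nothing]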
addEmptyFields :: TokenizeSpec
addEmptyFields = mempty { tsAddEmptyFields = pure True }
-- | same as addEmptyFields
emptyFields :: TokenizeSpec
emptyFields = addEmptyFields
-- | Turns off token normalization. Makes the tokenizer
-- generate a raw character stream. Useful for debugging.
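--
-- An illustrative sketch: with normalization off, raw character tokens come back.
--
-- >>> tokenize nn "ab c" :: [Text]
-- ["a","b"," ","c"]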
nn :: TokenizeSpec
nn = mempty { tsNotNormalize = pure True }
-- | Turns on single-quoted string literals.
-- Character stream after '\'' character
-- will be processed as a single-quoted stream,
-- treating all delimiter, comment and other special
-- characters as a part of the string literal until
-- the next unescaped single quote character.
sq :: TokenizeSpec
sq = mempty { tsStringQ = pure True }
-- | Enable double-quoted string literals support
-- as 'sq' for single-quoted strings.
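--
-- An illustrative sketch:
--
-- >>> tokenize (delims " " <> sqq) "say \"hello world\" now" :: [Text]
-- ["say","hello world","now"]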
sqq :: TokenizeSpec
sqq = mempty { tsStringQQ = pure True }
-- | Disable separate string literals.
--
-- Useful when processing delimited data (CSV-like formats).
-- Normally, sequential text chunks are concatenated together,
-- but consecutive text and a string literal will produce two
-- different tokens, which may cause weird results if the data
-- is in a CSV-like format, e.g.:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq ) "aaa:bebe:'qq' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq",Just " aaa",Just "next",Nothing,Nothing]
--
-- look: "qq" and " aaa" are turned into two separate tokens that makes the result
-- of CSV processing looks improper, like it has an extra-column. This behavior may be
-- avoided using this option, if you don't need to distinguish text chunks and string
-- literals:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq<>noslits) "aaa:bebe:'qq:foo' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq:foo aaa",Just "next",Nothing,Nothing]
--
noslits :: TokenizeSpec
noslits = mempty { tsNoSlits = pure True }
-- | Specify the list of delimiters (characters)
-- used to split the character stream into fields. Useful for CSV-like separated formats. Support for
-- empty fields in the token stream may be enabled with the 'addEmptyFields' function.
delims :: String -> TokenizeSpec
delims s = mempty { tsDelims = Set.fromList s }
-- | Strip spaces on left side of a token.
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces.
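--
-- An illustrative sketch:
--
-- >>> tokenize (sl <> delims ":") " a : b" :: [Text]
-- ["a ","b"]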
sl :: TokenizeSpec
sl = mempty { tsStripLeft = pure True }
-- | Strip spaces on right side of a token.
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces.
sr :: TokenizeSpec
sr = mempty { tsStripRight = pure True }
-- | Strips spaces on the right and left sides and collapses multiple spaces into one.
-- The name comes from unwords . words.
--
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces.
uw :: TokenizeSpec
uw = mempty { tsUW = pure True }
-- | Specify the line comment prefix.
-- All text after the line comment prefix will
-- be ignored until a newline character appears.
-- Multiple line comment prefixes are supported.
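--
-- An illustrative sketch (the "--" prefix here is just an example):
--
-- >>> tokenize (delims " " <> comment "--") "foo bar -- ignored" :: [Text]
-- ["foo","bar"]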
comment :: Text -> TokenizeSpec
comment s = mempty { tsLineComment = cmt }
where
cmt = case Text.uncons s of
Just (p,su) -> Map.singleton p su
Nothing -> mempty
-- | Specify the punctuation characters.
-- Any punctuation character is handled as a separate
-- token.
-- Any token will be broken on a punctuation character.
--
-- Useful for handling ... er... punctuation, like
--
-- >> function(a,b)
--
-- or
--
-- >> (apply function 1 2 3)
--
--
-- >>> tokenize spec "(apply function 1 2 3)" :: [Text]
-- ["(","apply","function","1","2","3",")"]
--
punct :: Text -> TokenizeSpec
punct s = mempty { tsPunct = Set.fromList (Text.unpack s) }
-- | Specify the keywords list.
-- Each keyword will be treated as a separate token.
keywords :: [Text] -> TokenizeSpec
keywords s = mempty { tsKeywords = Set.fromList s }
-- | Enable indentation support
indent :: TokenizeSpec
indent = mempty { tsIndent = Just True }
-- | Set the tab expansion multiplier,
-- i.e. each tab expands into n spaces before processing.
-- It also turns on the indentation. Only the tabs at the beginning of the string are expanded,
-- i.e. before the first non-space character appears.
itabstops :: Int -> TokenizeSpec
itabstops n = mempty { tsIndent = Just True, tsItabStops = pure n }
newtype TokenizeM w a = TokenizeM (RWS TokenizeSpec w () a)
deriving( Applicative
, Functor
, MonadReader TokenizeSpec
, MonadWriter w
, MonadState ()
, Monad
)
data Token = TChar Char
| TSChar Char
| TPunct Char
| TText Text
| TSLit Text
| TKeyword Text
| TEmpty
| TDelim
| TIndent Int
| TEol
deriving (Eq,Ord,Show)
-- | Typeclass for token values.
-- Note that some tokens appear in the results
-- only when the 'nn' option is set: normally, sequences
-- of characters are turned into text tokens or string literals,
-- and delimiter tokens are just removed from the
-- results.
class IsToken a where
-- | Create a character token
mkChar :: Char -> a
-- | Create a string literal character token
mkSChar :: Char -> a
-- | Create a punctuation token
mkPunct :: Char -> a
-- | Create a text chunk token
mkText :: Text -> a
-- | Create a string literal token
mkStrLit :: Text -> a
-- | Create a keyword token
mkKeyword :: Text -> a
-- | Create an empty field token
mkEmpty :: a
-- | Create a delimiter token
mkDelim :: a
mkDelim = mkEmpty
-- | Creates an indent token
mkIndent :: Int -> a
mkIndent = const mkEmpty
-- | Creates an EOL token
mkEol :: a
mkEol = mkEmpty
instance IsToken (Maybe Text) where
mkChar = pure . Text.singleton
mkSChar = pure . Text.singleton
mkPunct = pure . Text.singleton
mkText = pure
mkStrLit = pure
mkKeyword = pure
mkEmpty = Nothing
instance IsToken Text where
mkChar = Text.singleton
mkSChar = Text.singleton
mkPunct = Text.singleton
mkText = id
mkStrLit = id
mkKeyword = id
mkEmpty = ""
-- | Tokenize a text
tokenize :: IsToken a => TokenizeSpec -> Text -> [a]
tokenize s t = map tr t1
where
t1 = tokenize' s t
tr (TChar c) = mkChar c
tr (TSChar c) = mkSChar c
tr (TText c) = mkText c
tr (TSLit c) = mkStrLit c
tr (TKeyword c) = mkKeyword c
tr TEmpty = mkEmpty
tr (TPunct c) = mkPunct c
tr TDelim = mkDelim
tr (TIndent n) = mkIndent n
tr TEol = mkEol
execTokenizeM :: TokenizeM [Token] a -> TokenizeSpec -> [Token]
execTokenizeM (TokenizeM m) spec =
let (_,w) = execRWS m spec () in norm w
where norm x | justTrue (tsNotNormalize spec) = x
| otherwise = normalize spec x
tokenize' :: TokenizeSpec -> Text -> [Token]
tokenize' spec txt = execTokenizeM (root' txt) spec
where
r = spec
noIndent = not doIndent
doIndent = justTrue (tsIndent r)
eolOk = justTrue (tsEol r)
root' x = scanIndent x >>= root
root ts = do
case Text.uncons ts of
Nothing -> pure ()
Just ('\n', rest) | doIndent -> raiseEol >> root' rest
Just (c, rest) | Set.member c (tsDelims r) -> tell [TDelim] >> root rest
Just ('\'', rest) | justTrue (tsStringQ r) -> scanQ '\'' rest
Just ('"', rest) | justTrue (tsStringQQ r) -> scanQ '"' rest
Just (c, rest) | Map.member c (tsLineComment r) -> scanComment (c,rest)
Just (c, rest) | Set.member c (tsPunct r) -> tell [TPunct c] >> root rest
Just (c, rest) | otherwise -> tell [TChar c] >> root rest
raiseEol | eolOk = tell [TEol]
| otherwise = pure ()
expandSpace ' ' = 1
expandSpace '\t' = (fromMaybe 8 (tsItabStops r))
expandSpace _ = 0
scanIndent x | noIndent = pure x
| otherwise = do
let (ss,as) = Text.span (\c -> c == ' ' || c == '\t') x
tell [ TIndent (sum (map expandSpace (Text.unpack ss))) ]
pure as
scanComment (c,rest) = do
suff <- Map.lookup c <$> asks tsLineComment
case suff of
Just t | Text.isPrefixOf t rest -> do
root $ Text.dropWhile ('\n' /=) rest
_ -> tell [TChar c] >> root rest
scanQ q ts = do
case Text.uncons ts of
Nothing -> root ts
Just ('\\', rest) | justTrue (tsEsc r) -> unesc (scanQ q) rest
| otherwise -> tell [tsChar '\\'] >> scanQ q rest
Just (c, rest) | c == q -> root rest
| otherwise -> tell [tsChar c] >> scanQ q rest
unesc f ts =
case Text.uncons ts of
Nothing -> f ts
Just ('"', rs) -> tell [tsChar '"' ] >> f rs
Just ('\'', rs) -> tell [tsChar '\''] >> f rs
Just ('\\', rs) -> tell [tsChar '\\'] >> f rs
Just ('t', rs) -> tell [tsChar '\t'] >> f rs
Just ('n', rs) -> tell [tsChar '\n'] >> f rs
Just ('r', rs) -> tell [tsChar '\r'] >> f rs
Just ('a', rs) -> tell [tsChar '\a'] >> f rs
Just ('b', rs) -> tell [tsChar '\b'] >> f rs
Just ('f', rs) -> tell [tsChar '\f'] >> f rs
Just ('v', rs) -> tell [tsChar '\v'] >> f rs
Just (_, rs) -> f rs
tsChar c | justTrue (tsNoSlits spec) = TChar c
| otherwise = TSChar c
newtype NormStats = NormStats { nstatBeforeDelim :: Int }
normalize :: TokenizeSpec -> [Token] -> [Token]
normalize spec tokens = snd $ execRWS (go tokens) () init
where
go [] = addEmptyField
go s@(TIndent _ : _) = do
let (iis, rest') = List.span isIndent s
tell [TIndent (sum [k | TIndent k <- iis])]
go rest'
go (TChar c0 : cs) = do
let (n,ns) = List.span isTChar cs
succStat
let chunk = eatSpaces $ Text.pack (c0 : [ c | TChar c <- n])
let kw = Set.member chunk (tsKeywords spec)
tell [ if kw then TKeyword chunk else TText chunk ]
go ns
go (TSChar x : xs) = do
let (n,ns) = List.span isTSChar xs
succStat
tell [ TSLit $ Text.pack (x : [ c | TSChar c <- n]) ]
go ns
go (TDelim : xs) = do
addEmptyField
pruneStat
go xs
go (TPunct c : xs) = do
tell [ TPunct c ]
succStat
go xs
go (x:xs) = tell [x] >> go xs
succStat = do
modify (\x -> x { nstatBeforeDelim = succ (nstatBeforeDelim x)})
pruneStat = do
modify (\x -> x { nstatBeforeDelim = 0 } )
addEmptyField = do
ns <- gets nstatBeforeDelim
when (ns == 0 && justTrue (tsAddEmptyFields spec) ) $ do
tell [ TEmpty ]
isTChar (TChar _) = True
isTChar _ = False
isTSChar (TSChar _) = True
isTSChar _ = False
isIndent (TIndent _) = True
isIndent _ = False
init = NormStats { nstatBeforeDelim = 0 }
eatSpaces s | sboth = Text.strip s
| sLonly = Text.stripStart s
| sRonly = Text.stripEnd s
| sWU = (Text.unwords . Text.words) s
| otherwise = s
where sboth = justTrue (tsStripLeft spec) && justTrue (tsStripRight spec)
sLonly = justTrue (tsStripLeft spec) && not (justTrue (tsStripRight spec))
sRonly = not (justTrue (tsStripLeft spec)) && justTrue (tsStripRight spec)
sWU = justTrue (tsUW spec)

1
src/Data/Text/LSH.hs Normal file

@ -0,0 +1 @@
module Data.Text.LSH where

150
test/FuzzyParseSpec.hs Normal file

@ -0,0 +1,150 @@
{-# LANGUAGE OverloadedStrings
, QuasiQuotes
, ExtendedDefaultRules
, LambdaCase
, ImportQualifiedPost
, DerivingStrategies
, PatternSynonyms
, ViewPatterns
, MultiWayIf
, TemplateHaskell
#-}
module FuzzyParseSpec (spec) where
import Data.Text (Text)
import Test.Hspec
import Text.InterpolatedString.Perl6 (q)
import Data.Text.Fuzzy.Tokenize
import Data.Data
import Data.Generics.Uniplate.Data()
import GHC.Generics
data TTok = TChar Char
| TSChar Char
| TPunct Char
| TText Text
| TStrLit Text
| TKeyword Text
| TEmpty
| TIndent Int
deriving stock (Eq,Ord,Show,Data,Generic)
instance IsToken TTok where
mkChar = TChar
mkSChar = TSChar
mkPunct = TPunct
mkText = TText
mkStrLit = TStrLit
mkKeyword = TKeyword
mkEmpty = TEmpty
mkIndent = TIndent
spec :: Spec
spec = do
describe "csv-like" $ do
it "splits text using ':' delimeter" $ do
let toks = tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
toks `shouldBe` ["aaa "," bebeb "," qqq "]
it "splits text using ':' delimeter with single-quotes string and empty fields" $ do
let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
toks `shouldBe` ["aaa "," bebeb "," qqq ","","","",""]
it "splits text using ':' delimeter with single-quotes string and empty fields" $ do
let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
toks `shouldBe` [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
it "splits text using ':' delimeter with single-quotes string and empty fields with noslits" $ do
let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
let toks = tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
toks `shouldBe` [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
it "splits text using ':' delimeter with single-quotes string and empty fields with noslits and uw" $ do
let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
let toks = tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
toks `shouldBe` [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
it "uses punctuation tokens" $ do
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
let toks = tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
toks `shouldBe` ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
it "tokenize simple lisp-like text with keywords" $ do
let spec = delims " \n\t" <> comment ";"
<> punct "{}()[]<>"
<> sq <> sqq
<> uw
<> keywords ["define","apply","+"]
let code = [q|
(define add (a b ) ; define simple function
(+ a b) )
(define r (add 10 20))
|]
let toks = tokenize spec code :: [TTok]
let expected = [ TPunct '('
, TKeyword "define"
, TText "add" , TPunct '(', TText "a" , TText "b", TPunct ')'
, TPunct '(', TKeyword "+", TText "a",TText "b",TPunct ')',TPunct ')'
, TPunct '(',TKeyword "define"
,TText "r"
,TPunct '(',TText "add",TText "10",TText "20"
,TPunct ')',TPunct ')']
toks `shouldBe` expected
describe "Checks indentation support" $ do
let spec = delims " \n\t" <> comment ";"
<> punct "{}()[]<>"
<> sq <> sqq
<> uw
<> indent
<> itabstops 8
<> keywords ["define"]
it "parses some indented blocks" $ do
let expected = [ TIndent 0, TKeyword "define", TText "a", TText "0"
, TIndent 2, TText "atom", TText "foo", TText "2"
, TIndent 2, TKeyword "define", TText "aq", TText "2"
, TIndent 4, TText "atom", TText "one", TText "4"
, TIndent 4, TText "atom", TText "two", TText "4"
, TIndent 0, TKeyword "define", TText "b", TText "0"
, TIndent 2, TText "atom", TText "baar", TText "2"
, TIndent 2, TText "atom", TText "quux", TText "2"
, TIndent 2, TKeyword "define", TText "new", TText "2"
, TIndent 6, TText "atom", TText "bar", TText "6"
, TIndent 4, TText "atom", TText "fuu", TText "4"
, TIndent 0
]
let pyLike = [q|
define a 0
atom foo 2
define aq 2
atom one 4
atom two 4
define b 0
atom baar 2
atom quux 2
define new 2
atom bar 6
atom fuu 4
|]
let toks = tokenize spec pyLike :: [TTok]
toks `shouldBe` expected

1
test/Spec.hs Normal file

@ -0,0 +1 @@
{-# OPTIONS_GHC -F -pgmF hspec-discover #-}