mirror of https://github.com/voidlizard/hbs2
Squashed 'miscellaneous/fuzzy-parse/' content from commit a834b152e
git-subtree-dir: miscellaneous/fuzzy-parse
git-subtree-split: a834b152e29d632c816eefe117036e5d9330bd03

commit cf85d2df2a

.envrc
@@ -0,0 +1,5 @@
if [ -f .envrc.local ]; then
  source_env .envrc.local
fi

use flake

.gitignore
@@ -0,0 +1,6 @@
*.swp
dist-newstyle/
Setup.hs

.direnv
.hbs2-git/

CHANGELOG.md
@@ -0,0 +1,10 @@
# Revision history for fuzzy-parse

## 0.1.2.0

- Technical release
- Added some missing things

## 0.1.0.0 -- YYYY-mm-dd

* First version. Released on an unsuspecting world.

LICENSE
@@ -0,0 +1,20 @@
Copyright (c) 2019 Dmitry Zuikov

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

README.markdown
@@ -0,0 +1,131 @@
# About

# Data.Text.Fuzzy.Tokenize

A lightweight and multi-functional text tokenizer that supports different types of text
tokenization depending on its settings.

It may be used in different situations: for DSLs, text markup, or even for parsing simple
grammars more easily, and sometimes faster, than with mainstream parser combinators or parser
generators.

The primary goal of this package is parsing unstructured text data, but it also handles data
formats such as CSV with ease.

Currently it supports the following types of entities: atoms, string literals (currently with a
minimal set of escaped characters), punctuation characters and delimiters.

## Examples

### Simple CSV-like tokenization

```haskell
tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]

["aaa "," bebeb "," qqq "]
```

```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]

["aaa "," bebeb "," qqq ","","","",""]
```

```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]

[Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```

```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]

[Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```

```haskell
let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]

[Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
```

```haskell
let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]

[Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
```

### Primitive lisp-like language

```haskell
{-# LANGUAGE QuasiQuotes, ExtendedDefaultRules #-}

import Text.InterpolatedString.Perl6 (q)
import Data.Text (Text)
import Data.Text.Fuzzy.Tokenize

data TTok = TChar Char
          | TSChar Char
          | TPunct Char
          | TText Text
          | TStrLit Text
          | TKeyword Text
          | TEmpty
          deriving(Eq,Ord,Show)

instance IsToken TTok where
  mkChar    = TChar
  mkSChar   = TSChar
  mkPunct   = TPunct
  mkText    = TText
  mkStrLit  = TStrLit
  mkKeyword = TKeyword
  mkEmpty   = TEmpty

main = do

  let spec = delims " \n\t" <> comment ";"
                            <> punct "{}()[]<>"
                            <> sq <> sqq
                            <> uw
                            <> keywords ["define","apply","+"]
  let code = [q|
    (define add (a b ) ; define simple function
      (+ a b) )
    (define r (add 10 20))
  |]
  let toks = tokenize spec code :: [TTok]

  print toks
```
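
Running this should print the token list: the braces come out as `TPunct`
tokens, `define` and `+` as `TKeyword`, the rest as `TText` chunks, and
everything after `;` up to the end of the line is dropped as a comment.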

## Notes

### About the delimiter tokens

These tokens appear while processing "delimited" formats and disappear from the results.
Currently you will never see them unless normalization is turned off with the 'nn' option.

The delimiters make sense when processing CSV-like formats, but in that case you probably
need only the values in the results.

This behavior may change later, but right now delimiters seem pointless in results. If you
process some sort of grammar where the delimiter character is important, you may use punctuation
instead, e.g.:

```haskell
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]

["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
```

### Other

For CSV-like formats it makes sense to split the text into lines first; otherwise newline
characters may cause weird results.
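
A minimal sketch of that advice (the `parseCsvish` helper name is made up for
illustration):

```haskell
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.Fuzzy.Tokenize

-- Tokenize line by line so newline characters never end up inside fields.
parseCsvish :: Text -> [[Maybe Text]]
parseCsvish = map (tokenize (delims ":" <> emptyFields)) . Text.lines
```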

# Authors

This library is written and maintained by Dmitry Zuikov, dzuikov@gmail.com

@@ -0,0 +1,12 @@
- [ ] TODO: Tests for Data.Text.Fuzzy.Section
- [ ] TODO: haddock for Data.Text.Fuzzy.Section

- [~] TODO: Tests
- [+] TODO: Freeze dependencies versions
- [ ] TODO: Version number
- [+] TODO: Haddocks
- [ ] TODO: Tokenizer: Indentation support
- [ ] TODO: Tokenizer: Block comments support

cabal.project
@@ -0,0 +1,6 @@
packages: *.cabal

allow-newer: all

tests: True

flake.lock
@@ -0,0 +1,61 @@
{
  "nodes": {
    "flake-utils": {
      "locked": {
        "lastModified": 1644229661,
        "narHash": "sha256-1YdnJAsNy69bpcjuoKdOYQX0YxZBiCYZo4Twxerqv7k=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "3cecb5b042f7f209c56ffd8371b2711a290ec797",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "haskell-flake-utils": {
      "inputs": {
        "flake-utils": "flake-utils"
      },
      "locked": {
        "lastModified": 1707809372,
        "narHash": "sha256-wfTL9PlCSOqSSyU4eenFFI7pHrV21gba4GEILnI4nAU=",
        "owner": "ivanovs-4",
        "repo": "haskell-flake-utils",
        "rev": "3cbdc5d6093e8b4464ae64097e0c8c61e4414ff2",
        "type": "github"
      },
      "original": {
        "owner": "ivanovs-4",
        "repo": "haskell-flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1727089097,
        "narHash": "sha256-ZMHMThPsthhUREwDebXw7GX45bJnBCVbfnH1g5iuSPc=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "568bfef547c14ca438c56a0bece08b8bb2b71a9c",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixpkgs-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "haskell-flake-utils": "haskell-flake-utils",
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}

flake.nix
@@ -0,0 +1,69 @@
{
  description = "Haskell cabal package";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    haskell-flake-utils.url = "github:ivanovs-4/haskell-flake-utils";

    # another-simple-haskell-flake.url = "something";

    # some-cabal-pkg.url = "github:example/some-cabal-pkg";
    # some-cabal-pkg.flake = false;
  };

  outputs = { self, nixpkgs, haskell-flake-utils, ... }@inputs:
    haskell-flake-utils.lib.simpleCabal2flake {
      inherit self nixpkgs;
      # systems = [ "x86_64-linux" ];

      name = "fuzzy-parse";

      shellExtBuildInputs = {pkgs}: with pkgs; [
        haskellPackages.haskell-language-server
      ];

      # Whether to build hoogle in the default shell
      shellWithHoogle = true;


      ## Optional parameters follow

      # nixpkgs config
      # config = { };

      # Add other Haskell flakes as requirements
      # haskellFlakes = [ inputs.another-simple-haskell-flake ];

      # Use this to load other flakes overlays to supplement nixpkgs
      # preOverlays = [ ];

      # Pass either a function or a file
      # preOverlay = ./overlay.nix;

      # Override haskell packages
      # hpPreOverrides = { pkgs }: new: old:
      #   with pkgs.haskell.lib; with haskell-flake-utils.lib;
      #   tunePackages pkgs old {
      #     some-haskellPackages-package = [ dontHaddock ];
      #   } // {
      #     some-cabal-pkg = ((jailbreakUnbreak pkgs) (dontCheck (old.callCabal2nix "some-cabal-pkg" inputs.some-cabal-pkg {})));
      #   };

      # Arguments for callCabal2nix
      # cabal2nixArgs = {pkgs}: {
      # };

      # Maps to the devShell output. Pass in a shell.nix file or function
      # shell = ./shell.nix

      # Additional build inputs of the default shell
      # shellExtBuildInputs = {pkgs}: with pkgs; [
      #   haskellPackages.haskell-language-server
      # ];

      # Whether to build hoogle in the default shell
      # shellWithHoogle = true;

    };
}

fuzzy-parse.cabal
@@ -0,0 +1,160 @@
cabal-version: 3.0

name:                fuzzy-parse
version:             0.1.3.1
synopsis:            Tools for processing unstructured text data

description:
    Lightweight and easy-to-use functions for text tokenizing and parsing. It is aimed at
    parsing mostly unstructured data, but structured formats may be parsed as well.

    It may be used in different situations: for DSLs, text markup, or even for parsing simple
    grammars more easily, and sometimes faster, than with mainstream parser combinators
    or parser generators.

    See the README.markdown, examples and modules documentation for more.

license:             MIT
license-file:        LICENSE
author:              Dmitry Zuikov
maintainer:          dzuikov@gmail.com

category:            Text, Parsing
extra-source-files:  CHANGELOG.md

homepage:            https://github.com/hexresearch/fuzzy-parse
bug-reports:         https://github.com/hexresearch/fuzzy-parse/issues

extra-source-files:
    README.markdown

common shared-properties

  default-language: GHC2021

  default-extensions:
                       ApplicativeDo
                     , BangPatterns
                     , BlockArguments
                     , ConstraintKinds
                     , DataKinds
                     , DeriveDataTypeable
                     , DeriveGeneric
                     , DerivingStrategies
                     , DerivingVia
                     , ExtendedDefaultRules
                     , FlexibleContexts
                     , FlexibleInstances
                     , GADTs
                     , GeneralizedNewtypeDeriving
                     , ImportQualifiedPost
                     , LambdaCase
                     , MultiParamTypeClasses
                     , OverloadedStrings
                     , QuasiQuotes
                     , RecordWildCards
                     , ScopedTypeVariables
                     , StandaloneDeriving
                     , TemplateHaskell
                     , TupleSections
                     , TypeApplications
                     , TypeFamilies



library

  import: shared-properties

  ghc-options:
        -Wall
        -fno-warn-type-defaults
        -O2
        "-with-rtsopts=-N4 -A64m -AL256m -I0"


  exposed-modules: Data.Text.Fuzzy.Tokenize
                 , Data.Text.Fuzzy.Dates
                 , Data.Text.Fuzzy.Section
                 , Data.Text.Fuzzy.SExp
                 , Data.Text.Fuzzy.Attoparsec.Day
                 , Data.Text.Fuzzy.Attoparsec.Month

  build-depends: base
               , attoparsec
               , containers
               , mtl
               , prettyprinter
               , safe
               , streaming
               , scientific
               , text
               , time
               , microlens-platform
               , uniplate
               , unliftio
               , unordered-containers
               , timeit

  hs-source-dirs: src


executable fuzzy-sexp-parse

  import: shared-properties
  default-language: GHC2021

  ghc-options:
        -Wall
        -fno-warn-type-defaults
        -O2


  main-is: FuzzySexpParse.hs

  hs-source-dirs: misc

  build-depends: base, fuzzy-parse
               , containers
               , hspec
               , hspec-discover
               , interpolatedstring-perl6
               , text
               , mtl
               , streaming
               , transformers
               , exceptions
               , uniplate
               , microlens-platform
               , safe
               , timeit
               , prettyprinter


test-suite fuzzy-parse-test
  import: shared-properties
  default-language: GHC2021

  type: exitcode-stdio-1.0
  main-is: Spec.hs
  other-modules: FuzzyParseSpec
  hs-source-dirs: test
  build-depends: base, fuzzy-parse
               , containers
               , hspec
               , hspec-discover
               , interpolatedstring-perl6
               , text
               , mtl
               , streaming
               , transformers
               , exceptions
               , uniplate
               , microlens-platform
               , safe
               , timeit

  build-tool-depends: hspec-discover:hspec-discover == 2.*

misc/FuzzySexpParse.hs
@@ -0,0 +1,30 @@
module Main where

import Data.Text.Fuzzy.SExp
import Data.Text.IO qualified as IO
import Data.Text qualified as Text
import Data.Either
import System.TimeIt
import Control.Monad.Except
import Data.Functor
import Data.Function
import Data.Fixed
import Prettyprinter
import System.IO

main :: IO ()
main = do
  s <- IO.getContents

  (tt,toks) <- timeItT do
    pure (tokenizeSexp s)

  (pt,top) <- timeItT do
    runExceptT (parseTop @() s) <&> either (error.show) id

  print (vcat (fmap pretty top))

  hPrint stderr $ pretty (Text.length s) <+> "chars, parsed in" <+> viaShow (realToFrac pt :: Fixed E6)
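
-- To try it (a sketch; the executable name comes from fuzzy-parse.cabal,
-- the input file name is made up):
--
--   cabal run fuzzy-sexp-parse < some-file.sexp
--
-- It prints the parsed top-level forms to stdout and the timing to stderr.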

pkgs.json
@@ -0,0 +1,7 @@
{
  "url": "https://github.com/NixOS/nixpkgs.git",
  "rev": "5cf0de2485efeccc307692eedadbb2d9bfdc7013",
  "date": "2020-06-04T17:30:37+08:00",
  "sha256": "07axrr50nlmnvba5ja2ihzjwczi66znak57bhcz472w22w7m3sd1",
  "fetchSubmodules": false
}

pkgs.nix
@@ -0,0 +1,5 @@
import ((import <nixpkgs> {}).fetchFromGitHub {
  owner = "NixOS";
  repo = "nixpkgs";
  inherit (builtins.fromJSON (builtins.readFile ./pkgs.json)) rev sha256;
})

@@ -0,0 +1,29 @@
let
  pkgs = import ./pkgs.nix { inherit config;
  };
  lib = pkgs.haskell.lib;
  config = {
    packageOverrides = pkgs: rec {
      haskellPackages = pkgs.haskellPackages.override { overrides = haskOverrides; };
    };
  };
  gitignore = pkgs.callPackage (pkgs.fetchFromGitHub {
    owner = "siers";
    repo = "nix-gitignore";
    rev = "ce0778ddd8b1f5f92d26480c21706b51b1af9166";
    sha256 = "1d7ab78i2k13lffskb23x8b5h24x7wkdmpvmria1v3wb9pcpkg2w";
  }) {};
  ignore = gitignore.gitignoreSourceAux ''
    .stack-work
    dist
    dist-newstyle
    .ghc.environment*
  '';
  haskOverrides = new: old:
    let overrides = lib.packagesFromDirectory { directory = ./derivations; } new old;
    in overrides;
in rec {
  inherit pkgs;
  packages = { inherit (pkgs.haskellPackages) fuzzy-parse;
  };
}

@@ -0,0 +1,3 @@
#!/usr/bin/env nix-shell
#! nix-shell -i bash -p nix-prefetch-git
nix-prefetch-git https://github.com/NixOS/nixpkgs.git | tee pkgs.json

@@ -0,0 +1,10 @@
#!/bin/sh
set -e

dir=$(mktemp -d dist-docs.XXXXXX)
trap 'rm -r "$dir"' EXIT

# assumes cabal 2.4 or later
cabal v2-haddock --builddir="$dir" --haddock-for-hackage --enable-doc

cabal upload -d --publish $dir/*-docs.tar.gz

src/Data/Text/Fuzzy/Attoparsec/Day.hs
@@ -0,0 +1,97 @@
module Data.Text.Fuzzy.Attoparsec.Day ( dayDMY
                                      , dayYMD
                                      , dayYYYYMMDD
                                      , dayDMonY
                                      , day
                                      ) where

import Data.List (zipWith)
import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser,decimal,digit,count,satisfy,inClass,skipWhile)
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
import qualified Data.Char as Char
import qualified Data.Text as Text



day :: Parser Day
day = dayDMonY <|> dayYYYYMMDD <|> dayYMD <|> dayDMY

skipDelim :: Parser ()
skipDelim = skipWhile (inClass " ./-")

dayDMY :: Parser Day
dayDMY = do
  d  <- decimal :: Parser Int
  skipDelim
  m  <- decimal :: Parser Int
  skipDelim
  y' <- decimal :: Parser Integer
  maybe (fail "bad date format") pure (makeDay y' m d)

dayYMD :: Parser Day
dayYMD = do
  y' <- decimal :: Parser Integer
  skipDelim
  m  <- decimal :: Parser Int
  skipDelim
  d  <- decimal :: Parser Int
  maybe (fail "bad date format") pure (makeDay y' m d)

dayYYYYMMDD :: Parser Day
dayYYYYMMDD = do
  y <- fromIntegral . num n4 . map o <$> count 4 digit
  m <- num n2 . map o <$> count 2 digit
  d <- num n2 . map o <$> count 2 digit
  maybe (fail "bad date format") pure (makeDay y m d)
  where n4 = [1000,100,10,1]
        n2 = [10,1]
        o x = Char.ord x - Char.ord '0'
        num n x = sum $ zipWith (*) x n

dayDMonY :: Parser Day
dayDMonY = do
  d <- decimal :: Parser Int
  skipDelim
  m <- pMon
  skipDelim
  y <- decimal :: Parser Integer
  maybe (fail "bad date format") pure (makeDay y m d)
  where
    pMon :: Parser Int
    pMon = do
      txt <- Text.toUpper . Text.pack <$> count 3 (satisfy Char.isLetter)
      case txt of
        "JAN" -> pure 1
        "FEB" -> pure 2
        "MAR" -> pure 3
        "APR" -> pure 4
        "MAY" -> pure 5
        "JUN" -> pure 6
        "JUL" -> pure 7
        "AUG" -> pure 8
        "SEP" -> pure 9
        "OCT" -> pure 10
        "NOV" -> pure 11
        "DEC" -> pure 12
        _     -> fail "bad month name"


makeYear :: Integer -> Maybe Integer
makeYear y' = if y < 1900 && y' < 99
                then Nothing
                else pure y
  where
    y = if y' < 50
          then y' + 2000
          else (if y' >= 50 && y' <= 99
                  then y' + 1900
                  else y' )
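
-- A sketch of the two-digit pivot implemented above (an observation based
-- on the definition, not from the docs): 0..49 map to the 2000s, 50..99
-- to the 1900s, and anything else is taken literally.
--
-- >>> makeYear 79
-- Just 1979
-- >>> makeYear 4
-- Just 2004
-- >>> makeYear 1979
-- Just 1979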

makeDay :: Integer -> Int -> Int -> Maybe Day
makeDay y m d
  | m <= 12 && m > 0 =
      makeYear y >>= \yyyy -> if d <= gregorianMonthLength yyyy m
                                then pure $ fromGregorian yyyy m d
                                else Nothing
  | otherwise = Nothing

src/Data/Text/Fuzzy/Attoparsec/Month.hs
@@ -0,0 +1,46 @@
module Data.Text.Fuzzy.Attoparsec.Month ( fuzzyMonth, fuzzyMonthFromText
                                        ) where

import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser,decimal,digit,letter,many1,parseOnly)
import Data.Map (Map)
import Data.Maybe
import Data.Text (Text)
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
import qualified Data.Char as Char
import qualified Data.Map as Map
import qualified Data.Text as Text


fuzzyMonth :: Parser Int
fuzzyMonth = pMonthNum <|> pMonth

fuzzyMonthFromText :: Text -> Maybe Int
fuzzyMonthFromText = either (const Nothing) Just . parseOnly fuzzyMonth

pMonthNum :: Parser Int
pMonthNum = do
  n <- decimal
  if n >= 1 && n <= 12
    then pure n
    else fail "invalid month number"

pMonth :: Parser Int
pMonth = do
  mo <- many1 (Char.toLower <$> letter)
  maybe (fail "invalid month name") pure (Map.lookup mo months)
  where
    months :: Map String Int
    months = Map.fromList [ ("jan", 1),  ("january"  , 1)
                          , ("feb", 2),  ("february" , 2), ("febuary", 2) -- keep the common misspelling too
                          , ("mar", 3),  ("march"    , 3)
                          , ("apr", 4),  ("april"    , 4)
                          , ("may", 5),  ("may"      , 5)
                          , ("jun", 6),  ("june"     , 6)
                          , ("jul", 7),  ("july"     , 7)
                          , ("aug", 8),  ("august"   , 8)
                          , ("sep", 9),  ("september", 9)
                          , ("oct", 10), ("october"  , 10)
                          , ("nov", 11), ("november" , 11)
                          , ("dec", 12), ("december" , 12)
                          ]
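
-- A couple of usage sketches (assumed, based on the definitions above):
--
-- >>> fuzzyMonthFromText "sep"
-- Just 9
-- >>> fuzzyMonthFromText "September"
-- Just 9
-- >>> fuzzyMonthFromText "9"
-- Just 9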

src/Data/Text/Fuzzy/Dates.hs
@@ -0,0 +1,46 @@
-- |
-- Module      : Data.Text.Fuzzy.Dates
-- Copyright   : Dmitry Zuikov 2020
-- License     : MIT
--
-- Maintainer  : dzuikov@gmail.com
-- Stability   : experimental
-- Portability : unknown
--
-- Fuzzy date parsing.
-- Supports a number of date formats and tries to recover
-- incomplete dates from text, using some
-- reasonable assumptions. Does not support locales,
-- i.e. it assumes English-only month names for now.
--
-- == Examples
--
-- > parseMaybeDay "01.01.1979"
-- > Just 1979-01-01
-- > parseMaybeDay "01.01.01"
-- > Just 2001-01-01
-- > parseMaybeDay "13/01/2019"
-- > Just 2019-01-13
-- > parseMaybeDay "2019-12-1"
-- > Just 2019-12-01
-- > parseMaybeDay "21-feb-79"
-- > Just 1979-02-21
-- > parseMaybeDay "21-feb-01"
-- > Just 2001-02-21
-- > parseMaybeDay "29feb04"
-- > Just 2004-02-29
-- > parseMaybeDay "21feb28"
-- > Just 2028-02-21

module Data.Text.Fuzzy.Dates where

import Data.Attoparsec.Text (parseOnly)
import Data.Either (either)
import Data.Text.Fuzzy.Attoparsec.Day
import Data.Text (Text)
import Data.Time.Calendar

-- | Tries to parse a date from the text.
parseMaybeDay :: Text -> Maybe Day
parseMaybeDay s = either (const Nothing) pure (parseOnly day s)

src/Data/Text/Fuzzy/SExp.hs
@@ -0,0 +1,378 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ExtendedDefaultRules #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ImportQualifiedPost #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE TemplateHaskell #-}
module Data.Text.Fuzzy.SExp where

import Data.Text (Text)

import Control.Applicative
import Control.Monad
import Data.Function
import Data.Functor
import Data.Text.Fuzzy.Tokenize
import Control.Monad.Reader
import Data.Typeable
import Control.Monad.Except
import Control.Monad.RWS
import Data.Maybe
import Data.Char (isSpace,digitToInt)
import Data.Generics.Uniplate.Data()
import Safe
import Data.Data
import GHC.Generics
import Lens.Micro.Platform
import Data.Text qualified as Text
import Data.Coerce
import Data.Scientific

import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HM

import Prettyprinter hiding (braces,list)


import Streaming.Prelude qualified as S

data TTok = TChar Char
          | TSChar Char
          | TPunct Char
          | TText Text
          | TStrLit Text
          | TKeyword Text
          | TEmpty
          | TIndent Int
          deriving stock (Eq,Ord,Show,Data,Generic)

instance IsToken TTok where
  mkChar    = TChar
  mkSChar   = TSChar
  mkPunct   = TPunct
  mkText    = TText
  mkStrLit  = TStrLit
  mkKeyword = TKeyword
  mkEmpty   = TEmpty
  mkIndent  = TIndent

newtype C0 = C0 (Maybe Int)
             deriving stock (Eq,Ord,Show,Data,Typeable,Generic)

data SExpParseError =
    ParensOver C0
  | ParensUnder C0
  | ParensUnmatched C0
  | SyntaxError C0
  deriving stock (Show,Typeable)


data NumType =
    NumInteger Integer
  | NumDouble Scientific
  deriving stock (Eq,Ord,Show,Data,Generic)

class Monoid c => ForMicroSexp c where

instance Monoid C0 where
  mempty = C0 Nothing

instance Semigroup C0 where
  (<>) (C0 a) (C0 b) = C0 (b <|> a)

instance ForMicroSexp C0 where


instance ForMicroSexp () where

data MicroSexp c =
    List_ c [MicroSexp c]
  | Symbol_ c Text
  | String_ c Text
  | Number_ c NumType
  | Boolean_ c Bool
  deriving stock (Show,Data,Generic)

pattern List :: ForMicroSexp c => [MicroSexp c] -> MicroSexp c
pattern List xs <- List_ _ xs where
  List xs = List_ mempty xs

pattern Symbol :: ForMicroSexp c => Text -> MicroSexp c
pattern Symbol xs <- Symbol_ _ xs where
  Symbol xs = Symbol_ mempty xs

pattern String :: ForMicroSexp c => Text -> MicroSexp c
pattern String x <- String_ _ x where
  String x = String_ mempty x

pattern Number :: ForMicroSexp c => NumType -> MicroSexp c
pattern Number n <- Number_ _ n where
  Number n = Number_ mempty n

pattern Boolean :: ForMicroSexp c => Bool -> MicroSexp c
pattern Boolean b <- Boolean_ _ b where
  Boolean b = Boolean_ mempty b

{-# COMPLETE List, Symbol, String, Number, Boolean #-}


contextOf :: Lens (MicroSexp c) (MicroSexp c) c c
contextOf = lens g s
  where
    s sexp c = case sexp of
      List_ _ a    -> List_ c a
      Symbol_ _ a  -> Symbol_ c a
      String_ _ a  -> String_ c a
      Number_ _ a  -> Number_ c a
      Boolean_ _ a -> Boolean_ c a

    g = \case
      List_ c _    -> c
      Symbol_ c _  -> c
      String_ c _  -> c
      Number_ c _  -> c
      Boolean_ c _ -> c

nil :: forall c . ForMicroSexp c => MicroSexp c
nil = List []

symbol :: forall c . ForMicroSexp c => Text -> MicroSexp c
symbol = Symbol

str :: forall c . ForMicroSexp c => Text -> MicroSexp c
str = String

newtype SExpEnv =
  SExpEnv
  { sexpTranslate :: Bool
  }

data SExpState =
  SExpState
  { _sexpLno    :: Int
  , _sexpBraces :: [Char]
  }

makeLenses 'SExpState

defEnv :: SExpEnv
defEnv = SExpEnv True

newtype SExpM m a = SExpM { fromSexpM :: RWST SExpEnv () SExpState m a }
                    deriving newtype
                      ( Applicative
                      , Functor
                      , Monad
                      , MonadState SExpState
                      , MonadReader SExpEnv
                      , MonadTrans
                      )


instance MonadError SExpParseError m => MonadError SExpParseError (SExpM m) where
  throwError = lift . throwError
  catchError w = catchError (coerce $ fromSexpM w)

tokenizeSexp :: Text -> [TTok]
tokenizeSexp txt = do
  let spec = delims " \r\t" <> comment ";"
                            <> punct "'{}()[]\n"
                            <> sqq
                            <> uw
  tokenize spec txt

runSexpM :: Monad m => SExpM m a -> m a
runSexpM f = evalRWST (fromSexpM f) defEnv (SExpState 0 []) <&> fst


parseSexp :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m (MicroSexp c)
parseSexp txt = do
  (s, _) <- runSexpM do
    (s,rest) <- sexp (tokenizeSexp txt)
    checkBraces
    pure (s,rest)

  pure s

checkBraces :: (MonadError SExpParseError m) => SExpM m ()
checkBraces = do
  braces <- gets (view sexpBraces)
  unless (null braces) $ raiseWith ParensUnder

succLno :: (MonadError SExpParseError m) => SExpM m ()
succLno = modify (over sexpLno succ)

parseTop :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m [MicroSexp c]
parseTop txt = do
  let tokens = tokenizeSexp txt
  S.toList_ $ runSexpM do
    flip fix (mempty,tokens) $ \next -> \case
      (acc, []) -> do
        emit acc
      (acc, TPunct '\n' : rest) -> do
        succLno
        emit acc
        next (mempty,rest)
      (acc, rest) -> do
        (s, xs) <- sexp rest
        next (acc <> [s],xs)

  where

    emit [] = pure ()
    emit wtf = case wtf of
      [List one] -> lift $ S.yield (List one)
      xs         -> lift $ S.yield (List xs)

sexp :: (ForMicroSexp c, MonadError SExpParseError m) => [TTok] -> SExpM m (MicroSexp c, [TTok])
sexp s = case s of
  [] -> do
    checkBraces
    pure (nil, mempty)

  (TText l : w) -> (,w) <$> trNum (Symbol l)

  (TStrLit l : w) -> pure (String l, w)

  -- so far ignored
  (TPunct '\'' : rest) -> sexp rest

  (TPunct '\n' : rest) -> succLno >> sexp rest

  (TPunct c : rest) | isSpace c -> sexp rest

  (TPunct c : rest) | isBrace c ->
                        maybe (pure (nil, rest)) (`list` rest) (closing c)
                    | otherwise -> do
                        raiseWith ParensOver

  ( _ : _ ) -> raiseWith SyntaxError

  where

    setContext w = do
      co <- getC0
      pure $ over _2 (set contextOf co) w

    isBrace :: Char -> Bool
    isBrace c = HM.member c braces

    closing :: Char -> Maybe Char
    closing c = HM.lookup c braces

    braces :: HashMap Char Char
    braces = HM.fromList [ ('{', '}')
                         , ('(', ')')
                         , ('[', ']')
                         , ('<', '>')
                         ]

    cBraces :: [Char]
    cBraces = HM.elems braces

    trNum tok = do

      trans <- asks sexpTranslate

      case tok of
        Symbol s | trans -> do
          let s0 = Text.unpack s

          let what =     Number . NumInteger <$> readMay @Integer s0
                     <|>
                         Number . NumInteger <$> parseBinary s0
                     <|>
                         Number . NumDouble  <$> readMay @Scientific s0
                     <|>
                         ( case s of
                             "#t" -> Just (Boolean True)
                             "#f" -> Just (Boolean False)
                             _    -> Nothing
                         )

          pure $ fromMaybe (Symbol s) what


        x -> pure x
    {-# INLINE trNum #-}

    list :: (ForMicroSexp c, MonadError SExpParseError m)
         => Char
         -> [TTok]
         -> SExpM m (MicroSexp c, [TTok])

    list _ [] = raiseWith ParensUnder

    list cb tokens = do
      modify $ over sexpBraces (cb:)

      go cb mempty tokens

      where

        isClosingFor :: Char -> Bool
        isClosingFor c = c `elem` cBraces

        go _ _ [] = do
          checkBraces
          pure (List mempty, mempty)

        go cl acc (TPunct c : rest) | isSpace c = do
          go cl acc rest

        go cl acc (TPunct c : rest)
          | isClosingFor c && c == cl = do
              modify $ over sexpBraces (drop 1)
              pure (List (reverse acc), rest)

          | isClosingFor c && c /= cl = do
              raiseWith ParensUnmatched
              -- throwError =<< ParensUnmatched <$> undefined

        go cl acc rest = do
          (e,r) <- sexp rest
          go cl (e : acc) r


getC0 :: Monad m => SExpM m C0
getC0 = do
  lno <- gets (view sexpLno)
  pure (C0 (Just lno))

raiseWith :: (MonadError SExpParseError m)
          => (C0 -> SExpParseError) -> SExpM m b

raiseWith a = throwError =<< a <$> getC0

instance Pretty NumType where
  pretty = \case
    NumInteger n -> pretty n
    NumDouble n  -> viaShow n

instance ForMicroSexp c => Pretty (MicroSexp c) where

  pretty = \case
    List xs       -> parens (hsep (fmap pretty xs))
    String s      -> dquotes (pretty s)
    Symbol s      -> pretty s
    Number n      -> pretty n
    Boolean True  -> pretty "#t"
    Boolean False -> pretty "#f"

isBinaryDigit :: Char -> Bool
isBinaryDigit c = c == '0' || c == '1'

parseBinary :: String -> Maybe Integer
parseBinary str =
  let
    withoutPrefix = case str of
      '0':'b':rest -> Just rest
      '0':'B':rest -> Just rest
      _            -> Nothing
  in if isJust withoutPrefix && all isBinaryDigit (fromJust withoutPrefix)
       then Just $ foldl (\acc x -> acc * 2 + toInteger (digitToInt x)) 0 (fromJust withoutPrefix)
       else Nothing
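
-- A usage sketch (assumed, based on the definition above): only strings
-- with an explicit 0b/0B prefix parse as binary numbers.
--
-- >>> parseBinary "0b101"
-- Just 5
-- >>> parseBinary "101"
-- Nothing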

src/Data/Text/Fuzzy/Section.hs
@@ -0,0 +1,15 @@
module Data.Text.Fuzzy.Section (cutSectionBy, cutSectionOn) where

import Data.Text (Text)
import qualified Data.List as List


cutSectionOn :: Text -> Text -> [Text] -> [Text]
cutSectionOn a b txt = cutSectionBy (== a) (== b) txt

cutSectionBy :: (Text -> Bool) -> (Text -> Bool) -> [Text] -> [Text]
cutSectionBy a b txt = cutI
  where
    cutC = List.dropWhile (not . a) txt
    cutI = List.takeWhile (not . b) cutC
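
-- A usage sketch (an assumption based on the definitions above, not taken
-- from package docs): the section starts at the first line matching the
-- first predicate (inclusive) and stops before the first subsequent line
-- matching the second one.
--
-- >>> cutSectionOn "# A" "# B" ["x", "# A", "y", "# B", "z"]
-- ["# A","y"]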

src/Data/Text/Fuzzy/Tokenize.hs
@@ -0,0 +1,556 @@
-- |
-- Module      : Data.Text.Fuzzy.Tokenize
-- Copyright   : Dmitry Zuikov 2020
-- License     : MIT
--
-- Maintainer  : dzuikov@gmail.com
-- Stability   : experimental
-- Portability : unknown
--
-- A lightweight and multi-functional text tokenizer that supports different types of text
-- tokenization depending on its settings.
--
-- It may be used in different situations: for DSLs, text markup or even for parsing simple
-- grammars more easily, and sometimes faster, than with mainstream parser combinators or
-- parser generators.
--
-- The primary goal of this package is parsing unstructured text data, but it also handles
-- data formats such as CSV with ease.
--
-- Currently it supports the following types of entities: atoms, string literals (currently with a
-- minimal set of escaped characters), punctuation characters and delimiters.
--
-- == Examples
-- === Simple CSV-like tokenization
-- >>> tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq "]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq ","","","",""]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
-- >>> tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
-- >>> tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
--
-- == Notes
--
-- === About the delimiter tokens
-- These tokens appear while processing "delimited"
-- formats and disappear from the results. Currently
-- you will never see them unless normalization is turned off with the 'nn' option.
--
-- The delimiters make sense when processing CSV-like formats,
-- but in that case you probably need only the values in the results.
--
-- This behavior may change later, but right now delimiters seem pointless
-- in results. If you process some sort of grammar where the delimiter character
-- is important, you may use punctuation instead, e.g.:
--
-- >>> let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
-- >>> tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
-- ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
--
-- == Other
-- For CSV-like formats it makes sense to split the text into lines first,
-- otherwise newline characters may cause weird results
--
--

module Data.Text.Fuzzy.Tokenize ( TokenizeSpec
                                , IsToken(..)
                                , tokenize
                                , esc
                                , addEmptyFields
                                , emptyFields
                                , nn
                                , sq
                                , sqq
                                , noslits
                                , sl
                                , sr
                                , uw
                                , delims
                                , comment
                                , punct
                                , indent
                                , itabstops
                                , keywords
                                , eol
                                ) where

import Prelude hiding (init)

import Control.Applicative
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import Data.Monoid()
import Data.Set (Set)
import Data.Text (Text)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text

import Control.Monad (when)
import Control.Monad.RWS

-- | Tokenization settings. Use mempty for an empty value
-- and construction functions for changing the settings.
--
data TokenizeSpec = TokenizeSpec { tsAtoms          :: Set Text
                                 , tsStringQQ       :: Maybe Bool
                                 , tsStringQ        :: Maybe Bool
                                 , tsNoSlits        :: Maybe Bool
                                 , tsLineComment    :: Map Char Text
                                 , tsDelims         :: Set Char
                                 , tsEol            :: Maybe Bool
                                 , tsStripLeft      :: Maybe Bool
                                 , tsStripRight     :: Maybe Bool
                                 , tsUW             :: Maybe Bool
                                 , tsNotNormalize   :: Maybe Bool
                                 , tsEsc            :: Maybe Bool
                                 , tsAddEmptyFields :: Maybe Bool
                                 , tsPunct          :: Set Char
                                 , tsIndent         :: Maybe Bool
                                 , tsItabStops      :: Maybe Int
                                 , tsKeywords       :: Set Text
                                 }
                    deriving (Eq,Ord,Show)


instance Semigroup TokenizeSpec where
  (<>) a b = TokenizeSpec { tsAtoms          = tsAtoms b <> tsAtoms a
                          , tsStringQQ       = tsStringQQ b <|> tsStringQQ a
                          , tsStringQ        = tsStringQ b <|> tsStringQ a
                          , tsNoSlits        = tsNoSlits b <|> tsNoSlits a
                          , tsLineComment    = tsLineComment b <> tsLineComment a
                          , tsDelims         = tsDelims b <> tsDelims a
                          , tsEol            = tsEol b <|> tsEol a
                          , tsStripLeft      = tsStripLeft b <|> tsStripLeft a
                          , tsStripRight     = tsStripRight b <|> tsStripRight a
                          , tsUW             = tsUW b <|> tsUW a
                          , tsNotNormalize   = tsNotNormalize b <|> tsNotNormalize a
                          , tsEsc            = tsEsc b <|> tsEsc a
                          , tsAddEmptyFields = tsAddEmptyFields b <|> tsAddEmptyFields a
                          , tsPunct          = tsPunct b <> tsPunct a
                          , tsIndent         = tsIndent b <|> tsIndent a
                          , tsItabStops      = tsItabStops b <|> tsItabStops a
                          , tsKeywords       = tsKeywords b <> tsKeywords a
                          }
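
-- A note on the instance above (an observation, not from the docs): the
-- merge is right-biased for the Maybe-valued switches and takes the union
-- of the set-valued fields, so in delims ":" <> delims ";" both ':' and
-- ';' become delimiters, while for flags the rightmost setting wins.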

instance Monoid TokenizeSpec where
  mempty = TokenizeSpec { tsAtoms          = mempty
                        , tsStringQQ       = Nothing
                        , tsStringQ        = Nothing
                        , tsNoSlits        = Nothing
                        , tsLineComment    = mempty
                        , tsDelims         = mempty
                        , tsEol            = Nothing
                        , tsStripLeft      = Nothing
                        , tsStripRight     = Nothing
                        , tsUW             = Nothing
                        , tsNotNormalize   = Nothing
                        , tsEsc            = Nothing
                        , tsAddEmptyFields = Nothing
                        , tsPunct          = mempty
                        , tsIndent         = Nothing
                        , tsItabStops      = Nothing
                        , tsKeywords       = mempty
                        }


justTrue :: Maybe Bool -> Bool
justTrue (Just True) = True
justTrue _           = False

-- | Turns on EOL token generation
eol :: TokenizeSpec
eol = mempty { tsEol = pure True }

-- | Turn on character escaping inside string literals.
-- Currently the following escaped characters are
-- supported: [" ' \ t n r a b f v ]
esc :: TokenizeSpec
esc = mempty { tsEsc = pure True }

-- | Raise empty field tokens (note the mkEmpty method)
-- when no tokens are found before a delimiter.
-- Useful for processing CSV-like data in
-- order to distinguish empty columns.
addEmptyFields :: TokenizeSpec
addEmptyFields = mempty { tsAddEmptyFields = pure True }

-- | Same as 'addEmptyFields'
emptyFields :: TokenizeSpec
emptyFields = addEmptyFields

-- | Turns off token normalization. Makes the tokenizer
-- generate a raw character stream. Useful for debugging.
nn :: TokenizeSpec
nn = mempty { tsNotNormalize = pure True }

-- | Turns on single-quoted string literals.
-- The character stream after a '\'' character
-- will be processed as a single-quoted stream,
-- treating all delimiter, comment and other special
-- characters as part of the string literal until
-- the next unescaped single quote character.
sq :: TokenizeSpec
sq = mempty { tsStringQ = pure True }

-- | Enable double-quoted string literal support,
-- as 'sq' does for single-quoted strings.
sqq :: TokenizeSpec
sqq = mempty { tsStringQQ = pure True }

-- | Disable separate string literals.
--
-- Useful when processing delimited data (CSV-like formats).
-- Normally, sequential text chunks are concatenated together,
-- but a text chunk followed by a string literal will produce two
-- different tokens, and this may cause weird results if the data
-- is in a CSV-like format, e.g.:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq ) "aaa:bebe:'qq' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq",Just " aaa",Just "next",Nothing,Nothing]
--
-- look: "qq" and " aaa" are turned into two separate tokens, which makes the result
-- of CSV processing look improper, as if it had an extra column. This behavior may be
-- avoided using this option, if you don't need to distinguish text chunks and string
-- literals:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq<>noslits) "aaa:bebe:'qq:foo' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq:foo aaa",Just "next",Nothing,Nothing]
--
noslits :: TokenizeSpec
noslits = mempty { tsNoSlits = pure True }

-- | Specify the list of delimiters (characters)
-- used to split the character stream into fields. Useful for CSV-like separated formats. Support
-- for empty fields in the token stream may be enabled with the 'addEmptyFields' function.
delims :: String -> TokenizeSpec
delims s = mempty { tsDelims = Set.fromList s }

-- | Strip spaces on the left side of a token.
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats; otherwise 'delims' may be used to skip unwanted spaces.
sl :: TokenizeSpec
sl = mempty { tsStripLeft = pure True }

-- | Strip spaces on the right side of a token.
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats; otherwise 'delims' may be used to skip unwanted spaces.
sr :: TokenizeSpec
sr = mempty { tsStripRight = pure True }

-- | Strips spaces on both sides and collapses runs of spaces into one.
-- The name comes from @unwords . words@.
--
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats; otherwise 'delims' may be used to skip unwanted spaces.
uw :: TokenizeSpec
uw = mempty { tsUW = pure True }

-- | Specify the line comment prefix.
-- All text after the line comment prefix is
-- ignored until a newline character appears.
-- Multiple line comments are supported.
comment :: Text -> TokenizeSpec
comment s = mempty { tsLineComment = cmt }
  where
    cmt = case Text.uncons s of
            Just (p,su) -> Map.singleton p su
            Nothing     -> mempty
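
-- A usage sketch (assumed, based on the definition above): everything
-- from the comment prefix to the end of the line is dropped.
--
-- >>> tokenize (delims " " <> comment ";") "a b ; the rest is ignored" :: [Text]
-- ["a","b"]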
|
||||||
|
|
||||||
|
-- | Specify the punctuation characters.
|
||||||
|
-- Any punctuation character is handled as a separate
|
||||||
|
-- token.
|
||||||
|
-- Any token will be breaked on a punctiation character.
|
||||||
|
--
|
||||||
|
-- Useful for handling ... er... punctuaton, like
|
||||||
|
--
|
||||||
|
-- >> function(a,b)
|
||||||
|
--
|
||||||
|
-- or
|
||||||
|
--
|
||||||
|
-- >> (apply function 1 2 3)
|
||||||
|
--
|
||||||
|
--
|
||||||
|
-- >>> tokenize spec "(apply function 1 2 3)" :: [Text]
|
||||||
|
-- ["(","apply","function","1","2","3",")"]
|
||||||
|
--
|
||||||
|
punct :: Text -> TokenizeSpec
|
||||||
|
punct s = mempty { tsPunct = Set.fromList (Text.unpack s) }
|
||||||
|
|
||||||
|
-- | Specify the keywords list.
|
||||||
|
-- Each keyword will be threated as a separate token.
|
||||||
|
keywords :: [Text] -> TokenizeSpec
|
||||||
|
keywords s = mempty { tsKeywords = Set.fromList s }
|
||||||
|
|
||||||
|
-- | Enable identation support
|
||||||
|
indent :: TokenizeSpec
|
||||||
|
indent = mempty { tsIndent = Just True }
|
||||||
|
|
||||||
|
-- | Set tab expanding multiplier
|
||||||
|
-- i.e. each tab extends into n spaces before processing.
|
||||||
|
-- It also turns on the indentation. Only the tabs at the beginning of the string are expanded,
|
||||||
|
-- i.e. before the first non-space character appears.
|
||||||
|
itabstops :: Int -> TokenizeSpec
|
||||||
|
itabstops n = mempty { tsIndent = Just True, tsItabStops = pure n }
|
||||||
|
|
||||||
|
newtype TokenizeM w a = TokenizeM (RWS TokenizeSpec w () a)
|
||||||
|
deriving( Applicative
|
||||||
|
, Functor
|
||||||
|
, MonadReader TokenizeSpec
|
||||||
|
, MonadWriter w
|
||||||
|
, MonadState ()
|
||||||
|
, Monad
|
||||||
|
)
|
||||||
|
|
||||||
|
data Token = TChar Char
|
||||||
|
| TSChar Char
|
||||||
|
| TPunct Char
|
||||||
|
| TText Text
|
||||||
|
| TSLit Text
|
||||||
|
| TKeyword Text
|
||||||
|
| TEmpty
|
||||||
|
| TDelim
|
||||||
|
| TIndent Int
|
||||||
|
| TEol
|
||||||
|
deriving (Eq,Ord,Show)
|
||||||
|
|
||||||
|
-- | Typeclass for token values.
|
||||||
|
-- Note, that some tokens appear in results
|
||||||
|
-- only when 'nn' option is set, i.e. sequences
|
||||||
|
-- of characters turn out to text tokens or string literals
|
||||||
|
-- and delimeter tokens are just removed from the
|
||||||
|
-- results
|
||||||
|
class IsToken a where
|
||||||
|
-- | Create a character token
|
||||||
|
mkChar :: Char -> a
|
||||||
|
-- | Create a string literal character token
|
||||||
|
mkSChar :: Char -> a
|
||||||
|
-- | Create a punctuation token
|
||||||
|
mkPunct :: Char -> a
|
||||||
|
-- | Create a text chunk token
|
||||||
|
mkText :: Text -> a
|
||||||
|
-- | Create a string literal token
|
||||||
|
mkStrLit :: Text -> a
|
||||||
|
-- | Create a keyword token
|
||||||
|
mkKeyword :: Text -> a
|
||||||
|
-- | Create an empty field token
|
||||||
|
mkEmpty :: a
|
||||||
|
-- | Create a delimeter token
|
||||||
|
mkDelim :: a
|
||||||
|
mkDelim = mkEmpty
|
||||||
|
|
||||||
|
-- | Creates an indent token
|
||||||
|
mkIndent :: Int -> a
|
||||||
|
mkIndent = const mkEmpty
|
||||||
|
|
||||||
|
-- | Creates an EOL token
|
||||||
|
mkEol :: a
|
||||||
|
mkEol = mkEmpty
|
||||||
|
|
||||||
|
instance IsToken (Maybe Text) where
|
||||||
|
mkChar = pure . Text.singleton
|
||||||
|
mkSChar = pure . Text.singleton
|
||||||
|
mkPunct = pure . Text.singleton
|
||||||
|
mkText = pure
|
||||||
|
mkStrLit = pure
|
||||||
|
mkKeyword = pure
|
||||||
|
mkEmpty = Nothing
|
||||||
|
|
||||||
|
instance IsToken Text where
|
||||||
|
mkChar = Text.singleton
|
||||||
|
mkSChar = Text.singleton
|
||||||
|
mkPunct = Text.singleton
|
||||||
|
mkText = id
|
||||||
|
mkStrLit = id
|
||||||
|
mkKeyword = id
|
||||||
|
mkEmpty = ""
|
||||||
|
|
||||||
|
-- | Tokenize a text
|
||||||
|
tokenize :: IsToken a => TokenizeSpec -> Text -> [a]
|
||||||
|
tokenize s t = map tr t1
|
||||||
|
where
|
||||||
|
t1 = tokenize' s t
|
||||||
|
tr (TChar c) = mkChar c
|
||||||
|
tr (TSChar c) = mkSChar c
|
||||||
|
tr (TText c) = mkText c
|
||||||
|
tr (TSLit c) = mkStrLit c
|
||||||
|
tr (TKeyword c) = mkKeyword c
|
||||||
|
tr TEmpty = mkEmpty
|
||||||
|
tr (TPunct c) = mkPunct c
|
||||||
|
tr TDelim = mkDelim
|
||||||
|
tr (TIndent n) = mkIndent n
|
||||||
|
tr TEol = mkEol
|
||||||
|
|
||||||
|
execTokenizeM :: TokenizeM [Token] a -> TokenizeSpec -> [Token]
|
||||||
|
execTokenizeM (TokenizeM m) spec =
|
||||||
|
let (_,w) = execRWS m spec () in norm w
|
||||||
|
|
||||||
|
where norm x | justTrue (tsNotNormalize spec) = x
|
||||||
|
| otherwise = normalize spec x
|
||||||
|
|
||||||
|
tokenize' :: TokenizeSpec -> Text -> [Token]
tokenize' spec txt = execTokenizeM (root' txt) spec
  where

    r = spec

    noIndent = not doIndent
    doIndent = justTrue (tsIndent r)
    eolOk    = justTrue (tsEol r)

    root' x = scanIndent x >>= root

    root ts = do
      case Text.uncons ts of
        Nothing -> pure ()

        Just ('\n', rest) | doIndent -> raiseEol >> root' rest
        Just (c, rest)    | Set.member c (tsDelims r) -> tell [TDelim] >> root rest
        Just ('\'', rest) | justTrue (tsStringQ r)  -> scanQ '\'' rest
        Just ('"', rest)  | justTrue (tsStringQQ r) -> scanQ '"' rest

        Just (c, rest) | Map.member c (tsLineComment r) -> scanComment (c,rest)

        Just (c, rest) | Set.member c (tsPunct r) -> tell [TPunct c] >> root rest

        Just (c, rest) | otherwise -> tell [TChar c] >> root rest

    raiseEol | eolOk     = tell [TEol]
             | otherwise = pure ()

    expandSpace ' '  = 1
    expandSpace '\t' = fromMaybe 8 (tsItabStops r)
    expandSpace _    = 0

    scanIndent x | noIndent  = pure x
                 | otherwise = do
        let (ss,as) = Text.span (\c -> c == ' ' || c == '\t') x
        tell [ TIndent (sum (map expandSpace (Text.unpack ss))) ]
        pure as
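    -- Worked example of the indent arithmetic above: with itabstops 8,
    -- a leading "\t  " expands to 8 + 1 + 1 = 10 columns and is
    -- emitted as TIndent 10.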
    scanComment (c,rest) = do
      suff <- Map.lookup c <$> asks tsLineComment
      case suff of
        Just t | Text.isPrefixOf t rest ->
          root $ Text.dropWhile ('\n' /=) rest

        _ -> tell [TChar c] >> root rest

    scanQ q ts = do
      case Text.uncons ts of
        Nothing -> root ts

        Just ('\\', rest) | justTrue (tsEsc r) -> unesc (scanQ q) rest
                          | otherwise          -> tell [tsChar '\\'] >> scanQ q rest

        Just (c, rest) | c == q    -> root rest
                       | otherwise -> tell [tsChar c] >> scanQ q rest

    unesc f ts =
      case Text.uncons ts of
        Nothing         -> f ts
        Just ('"', rs)  -> tell [tsChar '"' ] >> f rs
        Just ('\'', rs) -> tell [tsChar '\''] >> f rs
        Just ('\\', rs) -> tell [tsChar '\\'] >> f rs
        Just ('t', rs)  -> tell [tsChar '\t'] >> f rs
        Just ('n', rs)  -> tell [tsChar '\n'] >> f rs
        Just ('r', rs)  -> tell [tsChar '\r'] >> f rs
        Just ('a', rs)  -> tell [tsChar '\a'] >> f rs
        Just ('b', rs)  -> tell [tsChar '\b'] >> f rs
        Just ('f', rs)  -> tell [tsChar '\f'] >> f rs
        Just ('v', rs)  -> tell [tsChar '\v'] >> f rs
        Just (_, rs)    -> f rs

    tsChar c | justTrue (tsNoSlits spec) = TChar c
             | otherwise                 = TSChar c
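-- A sketch of the escape handling above: when tsEsc is enabled, a
-- single-quoted input like 'a\nb' goes through 'unesc', so the
-- resulting TSLit contains a real newline; an unrecognized escape
-- such as '\z' is dropped entirely (the final 'Just (_, rs)' case).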
newtype NormStats = NormStats { nstatBeforeDelim :: Int }

normalize :: TokenizeSpec -> [Token] -> [Token]
normalize spec tokens = snd $ execRWS (go tokens) () init
  where

    go [] = addEmptyField

    go s@(TIndent _ : _) = do
      let (iis, rest') = List.span isIndent s
      tell [TIndent (sum [k | TIndent k <- iis])]
      go rest'

    go (TChar c0 : cs) = do
      let (n,ns) = List.span isTChar cs
      succStat
      let chunk = eatSpaces $ Text.pack (c0 : [ c | TChar c <- n])
      let kw = Set.member chunk (tsKeywords spec)
      tell [ if kw then TKeyword chunk else TText chunk ]
      go ns

    go (TSChar x : xs) = do
      let (n,ns) = List.span isTSChar xs
      succStat
      tell [ TSLit $ Text.pack (x : [ c | TSChar c <- n]) ]
      go ns

    go (TDelim : xs) = do
      addEmptyField
      pruneStat
      go xs

    go (TPunct c : xs) = do
      tell [ TPunct c ]
      succStat
      go xs

    go (x:xs) = tell [x] >> go xs

    succStat =
      modify (\x -> x { nstatBeforeDelim = succ (nstatBeforeDelim x)})

    pruneStat =
      modify (\x -> x { nstatBeforeDelim = 0 } )

    addEmptyField = do
      ns <- gets nstatBeforeDelim
      when (ns == 0 && justTrue (tsAddEmptyFields spec) ) $ do
        tell [ TEmpty ]

    isTChar (TChar _) = True
    isTChar _         = False

    isTSChar (TSChar _) = True
    isTSChar _          = False

    isIndent (TIndent _) = True
    isIndent _           = False

    init = NormStats { nstatBeforeDelim = 0 }

    eatSpaces s | sboth     = Text.strip s
                | sLonly    = Text.stripStart s
                | sRonly    = Text.stripEnd s
                | sWU       = (Text.unwords . Text.words) s
                | otherwise = s

      where sboth  = justTrue (tsStripLeft spec) && justTrue (tsStripRight spec)
            sLonly = justTrue (tsStripLeft spec) && not (justTrue (tsStripRight spec))
            sRonly = not (justTrue (tsStripLeft spec)) && justTrue (tsStripRight spec)
            sWU    = justTrue (tsUW spec)
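-- A sketch of the stripping options handled by eatSpaces (GHCi;
-- assuming 'sl' and 'sr' are the combinators setting tsStripLeft and
-- tsStripRight):
--
-- >>> tokenize (delims ":" <> sl <> sr) " a : b " :: [Text]
-- ["a","b"]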
@ -0,0 +1 @@
module Data.Text.LSH where
@ -0,0 +1,150 @@
{-# LANGUAGE OverloadedStrings
           , QuasiQuotes
           , ExtendedDefaultRules
           , LambdaCase
           , ImportQualifiedPost
           , DerivingStrategies
           , PatternSynonyms
           , ViewPatterns
           , MultiWayIf
           , TemplateHaskell
#-}

module FuzzyParseSpec (spec) where

import Data.Text (Text)
import Test.Hspec
import Text.InterpolatedString.Perl6 (q)

import Data.Text.Fuzzy.Tokenize
import Data.Data
import Data.Generics.Uniplate.Data()
import GHC.Generics
data TTok = TChar Char
          | TSChar Char
          | TPunct Char
          | TText Text
          | TStrLit Text
          | TKeyword Text
          | TEmpty
          | TIndent Int
          deriving stock (Eq,Ord,Show,Data,Generic)

instance IsToken TTok where
  mkChar = TChar
  mkSChar = TSChar
  mkPunct = TPunct
  mkText = TText
  mkStrLit = TStrLit
  mkKeyword = TKeyword
  mkEmpty = TEmpty
  mkIndent = TIndent
spec :: Spec
spec = do
  describe "csv-like" $ do
    it "splits text using the ':' delimiter" $ do
      let toks = tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
      toks `shouldBe` ["aaa "," bebeb "," qqq "]

    it "splits text using the ':' delimiter with single-quoted strings and empty fields" $ do
      let toks = tokenize (delims ":"<>sq<>emptyFields) "aaa : bebeb : qqq ::::" :: [Text]
      toks `shouldBe` ["aaa "," bebeb "," qqq ","","","",""]

    it "splits text with a single-quoted literal and empty fields as [Maybe Text]" $ do
      let toks = tokenize (delims ":"<>sq<>emptyFields) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
      toks `shouldBe` [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]

    it "splits text using the ':' delimiter with single-quoted strings, empty fields and noslits" $ do
      let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
      let toks = tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
      toks `shouldBe` [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]

    it "splits text using the ':' delimiter with single-quoted strings, empty fields, noslits and uw" $ do
      let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
      let toks = tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
      toks `shouldBe` [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
it "uses punctuation tokens" $ do
|
||||||
|
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
|
||||||
|
let toks = tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
|
||||||
|
toks `shouldBe` ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
|
||||||
|
|
||||||
|
|
||||||
|
it "tokenize simple lisp-like text with keywords" $ do
|
||||||
|
let spec = delims " \n\t" <> comment ";"
|
||||||
|
<> punct "{}()[]<>"
|
||||||
|
<> sq <> sqq
|
||||||
|
<> uw
|
||||||
|
<> keywords ["define","apply","+"]
|
||||||
|
|
||||||
|
let code = [q|
|
||||||
|
(define add (a b ) ; define simple function
|
||||||
|
(+ a b) )
|
||||||
|
(define r (add 10 20))
|
||||||
|
|]
|
||||||
|
|
||||||
|
let toks = tokenize spec code :: [TTok]
|
||||||
|
|
||||||
|
let expected = [ TPunct '('
|
||||||
|
, TKeyword "define"
|
||||||
|
, TText "add" , TPunct '(', TText "a" , TText "b", TPunct ')'
|
||||||
|
, TPunct '(', TKeyword "+", TText "a",TText "b",TPunct ')',TPunct ')'
|
||||||
|
, TPunct '(',TKeyword "define"
|
||||||
|
,TText "r"
|
||||||
|
,TPunct '(',TText "add",TText "10",TText "20"
|
||||||
|
,TPunct ')',TPunct ')']
|
||||||
|
|
||||||
|
toks `shouldBe` expected
|
||||||
|
|
||||||
|
|
||||||
|
describe "Checks indentation support" $ do
|
||||||
|
|
||||||
|
let spec = delims " \n\t" <> comment ";"
|
||||||
|
<> punct "{}()[]<>"
|
||||||
|
<> sq <> sqq
|
||||||
|
<> uw
|
||||||
|
<> indent
|
||||||
|
<> itabstops 8
|
||||||
|
<> keywords ["define"]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
it "parses some indented blocks" $ do
|
||||||
|
|
||||||
|
let expected = [ TIndent 0, TKeyword "define", TText "a", TText "0"
|
||||||
|
, TIndent 2, TText "atom", TText "foo", TText "2"
|
||||||
|
, TIndent 2, TKeyword "define", TText "aq", TText "2"
|
||||||
|
, TIndent 4, TText "atom", TText "one", TText "4"
|
||||||
|
, TIndent 4, TText "atom", TText "two", TText "4"
|
||||||
|
, TIndent 0, TKeyword "define", TText "b", TText "0"
|
||||||
|
, TIndent 2, TText "atom", TText "baar", TText "2"
|
||||||
|
, TIndent 2, TText "atom", TText "quux", TText "2"
|
||||||
|
, TIndent 2, TKeyword "define", TText "new", TText "2"
|
||||||
|
, TIndent 6, TText "atom", TText "bar", TText "6"
|
||||||
|
, TIndent 4, TText "atom", TText "fuu", TText "4"
|
||||||
|
, TIndent 0
|
||||||
|
]
|
||||||
|
|
||||||
|
let pyLike = [q|
|
||||||
|
define a 0
|
||||||
|
atom foo 2
|
||||||
|
define aq 2
|
||||||
|
atom one 4
|
||||||
|
atom two 4
|
||||||
|
|
||||||
|
define b 0
|
||||||
|
atom baar 2
|
||||||
|
atom quux 2
|
||||||
|
define new 2
|
||||||
|
atom bar 6
|
||||||
|
atom fuu 4
|
||||||
|
|
||||||
|
|]
|
||||||
|
let toks = tokenize spec pyLike :: [TTok]
|
||||||
|
toks `shouldBe` expected
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1 @@
{-# OPTIONS_GHC -F -pgmF hspec-discover #-}