Squashed 'miscellaneous/fuzzy-parse/' content from commit a834b152e

git-subtree-dir: miscellaneous/fuzzy-parse
git-subtree-split: a834b152e29d632c816eefe117036e5d9330bd03
This commit is contained in:
voidlizard 2024-10-07 05:06:03 +03:00
commit cf85d2df2a
26 changed files with 1854 additions and 0 deletions

5
.envrc Normal file

@ -0,0 +1,5 @@
if [ -f .envrc.local ]; then
source_env .envrc.local
fi
use flake

6
.gitignore vendored Normal file

@ -0,0 +1,6 @@
*.swp
dist-newstyle/
Setup.hs
.direnv
.hbs2-git/

10
CHANGELOG.md Normal file

@ -0,0 +1,10 @@
# Revision history for fuzzy-parse
## 0.1.2.0
- Technical release
- Added some missing pieces
## 0.1.0.0 -- YYYY-mm-dd
* First version. Released on an unsuspecting world.

20
LICENSE Normal file

@ -0,0 +1,20 @@
Copyright (c) 2019 Dmitry Zuikov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

131
README.markdown Normal file

@ -0,0 +1,131 @@
# About
# Data.Text.Fuzzy.Tokenize
The lightweight and multi-functional text tokenizer, allowing different types of text tokenization
depending on its settings.
It may be used in different situations: for DSLs, text markup or even for parsing simple grammars,
more easily and sometimes faster than with mainstream parser combinators or parser
generators.
The primary goal of this package is to parse unstructured text data, however it may be used for
parsing such data formats as CSV with ease.
Currently it supports the following types of entities: atoms, string literals (currently with a
minimal set of escaped characters), punctuation characters and delimiters.
## Examples
### Simple CSV-like tokenization
```haskell
tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
["aaa "," bebeb "," qqq "]
```
```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
["aaa "," bebeb "," qqq ","","","",""]
```
```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]
[Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```
```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
[Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```
```haskell
let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
[Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
```
```haskell
let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
[Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
```
### Primitive lisp-like language
```haskell
{-# LANGUAGE QuasiQuotes, ExtendedDefaultRules #-}
import Text.InterpolatedString.Perl6 (q)
import Data.Text.Fuzzy.Tokenize
data TTok = TChar Char
| TSChar Char
| TPunct Char
| TText Text
| TStrLit Text
| TKeyword Text
| TEmpty
deriving(Eq,Ord,Show)
instance IsToken TTok where
mkChar = TChar
mkSChar = TSChar
mkPunct = TPunct
mkText = TText
mkStrLit = TStrLit
mkKeyword = TKeyword
mkEmpty = TEmpty
main = do
let spec = delims " \n\t" <> comment ";"
<> punct "{}()[]<>"
<> sq <> sqq
<> uw
<> keywords ["define","apply","+"]
let code = [q|
(define add (a b ) ; define simple function
(+ a b) )
(define r (add 10 20))
|]
let toks = tokenize spec code :: [TTok]
print toks
```
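Running this prints a token list along the lines of the following (the same expectation appears in the test suite):
```haskell
[ TPunct '(', TKeyword "define", TText "add", TPunct '(', TText "a", TText "b", TPunct ')'
, TPunct '(', TKeyword "+", TText "a", TText "b", TPunct ')', TPunct ')'
, TPunct '(', TKeyword "define", TText "r"
, TPunct '(', TText "add", TText "10", TText "20", TPunct ')', TPunct ')'
]
```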
## Notes
### About the delimiter tokens
Delimiter tokens appear while processing "delimited" formats and disappear from the results.
Currently you will never see them unless normalization is turned off with the 'nn' option.
Delimiters make sense when processing CSV-like formats, but in that case you probably
need only the values in the results.
This behavior may change later. But right now delimiter tokens seem pointless in the results. If you
process some sort of grammar where the delimiter character is important, you may use punctuation
instead, e.g.:
```haskell
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
```
### Other
For CSV-like formats it makes sense to split the text into lines first, otherwise newline characters
may lead to weird results.
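For example, a minimal sketch of per-line tokenization (assuming the whole input is already available as `csvText :: Text`):
```haskell
import qualified Data.Text as Text

-- tokenize each line on its own so newline characters never end up inside fields
rows = map (tokenize (delims ":" <> emptyFields)) (Text.lines csvText) :: [[Maybe Text]]
```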
# Authors
This library is written and maintained by Dmitry Zuikov, dzuikov@gmail.com

12
TODO Normal file

@ -0,0 +1,12 @@
- [ ] TODO: Tests for Data.Text.Fuzzy.Section
- [ ] TODO: haddock for Data.Text.Fuzzy.Section
- [~] TODO: Tests
- [+] TODO: Freeze dependencies versions
- [ ] TODO: Version number
- [+] TODO: Haddocks
- [ ] TODO: Tokenizer: Indentation support
- [ ] TODO: Tokenizer: Block comments support

6
cabal.project Normal file

@ -0,0 +1,6 @@
packages: *.cabal
allow-newer: all
tests: True

61
flake.lock Normal file

@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"locked": {
"lastModified": 1644229661,
"narHash": "sha256-1YdnJAsNy69bpcjuoKdOYQX0YxZBiCYZo4Twxerqv7k=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "3cecb5b042f7f209c56ffd8371b2711a290ec797",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"haskell-flake-utils": {
"inputs": {
"flake-utils": "flake-utils"
},
"locked": {
"lastModified": 1707809372,
"narHash": "sha256-wfTL9PlCSOqSSyU4eenFFI7pHrV21gba4GEILnI4nAU=",
"owner": "ivanovs-4",
"repo": "haskell-flake-utils",
"rev": "3cbdc5d6093e8b4464ae64097e0c8c61e4414ff2",
"type": "github"
},
"original": {
"owner": "ivanovs-4",
"repo": "haskell-flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1727089097,
"narHash": "sha256-ZMHMThPsthhUREwDebXw7GX45bJnBCVbfnH1g5iuSPc=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "568bfef547c14ca438c56a0bece08b8bb2b71a9c",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"haskell-flake-utils": "haskell-flake-utils",
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

69
flake.nix Normal file

@ -0,0 +1,69 @@
{
description = "Haskell cabal package";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
haskell-flake-utils.url = "github:ivanovs-4/haskell-flake-utils";
# another-simple-haskell-flake.url = "something";
# some-cabal-pkg.url = "github:example/some-cabal-pkg";
# some-cabal-pkg.flake = false;
};
outputs = { self, nixpkgs, haskell-flake-utils, ... }@inputs:
haskell-flake-utils.lib.simpleCabal2flake {
inherit self nixpkgs;
# systems = [ "x86_64-linux" ];
# DON'T FORGET TO PUT YOUR PACKAGE NAME HERE, REMOVING `throw`
name = "fuzzy-parse";
shellExtBuildInputs = {pkgs}: with pkgs; [
haskellPackages.haskell-language-server
];
# Whether to build Hoogle in the default shell
shellWithHoogle = true;
## Optional parameters follow
# nixpkgs config
# config = { };
# Add another haskell flakes as requirements
# haskellFlakes = [ inputs.another-simple-haskell-flake ];
# Use this to load other flakes overlays to supplement nixpkgs
# preOverlays = [ ];
# Pass either a function or a file
# preOverlay = ./overlay.nix;
# Override haskell packages
# hpPreOverrides = { pkgs }: new: old:
# with pkgs.haskell.lib; with haskell-flake-utils.lib;
# tunePackages pkgs old {
# some-haskellPackages-package = [ dontHaddock ];
# } // {
# some-cabal-pkg = ((jailbreakUnbreak pkgs) (dontCheck (old.callCabal2nix "some-cabal-pkg" inputs.some-cabal-pkg {})));
# };
# Arguments for callCabal2nix
# cabal2nixArgs = {pkgs}: {
# };
# Maps to the devShell output. Pass in a shell.nix file or function
# shell = ./shell.nix
# Additional build intputs of the default shell
# shellExtBuildInputs = {pkgs}: with pkgs; [
# haskellPackages.haskell-language-server
# ];
# Whether to build Hoogle in the default shell
# shellWithHoogle = true;
};
}

160
fuzzy-parse.cabal Normal file

@ -0,0 +1,160 @@
cabal-version: 3.0
name: fuzzy-parse
version: 0.1.3.1
synopsis: Tools for processing unstructured text data
description:
Lightweight and easy-to-use functions for text tokenizing and parsing, aimed at
parsing mostly unstructured data, though structured formats may be parsed as well.
It may be used in different situations: for DSLs, text markup or even for parsing simple
grammars, more easily and sometimes faster than with mainstream parser combinators
or parser generators.
See the README.markdown, examples and modules documentation for more.
license: MIT
license-file: LICENSE
author: Dmitry Zuikov
maintainer: dzuikov@gmail.com
category: Text, Parsing
extra-source-files: CHANGELOG.md
homepage: https://github.com/hexresearch/fuzzy-parse
bug-reports: https://github.com/hexresearch/fuzzy-parse/issues
extra-source-files:
README.markdown
common shared-properties
default-language: GHC2021
default-extensions:
ApplicativeDo
, BangPatterns
, BlockArguments
, ConstraintKinds
, DataKinds
, DeriveDataTypeable
, DeriveGeneric
, DerivingStrategies
, DerivingVia
, ExtendedDefaultRules
, FlexibleContexts
, FlexibleInstances
, GADTs
, GeneralizedNewtypeDeriving
, ImportQualifiedPost
, LambdaCase
, MultiParamTypeClasses
, OverloadedStrings
, QuasiQuotes
, RecordWildCards
, ScopedTypeVariables
, StandaloneDeriving
, TemplateHaskell
, TupleSections
, TypeApplications
, TypeFamilies
library
import: shared-properties
ghc-options:
-Wall
-fno-warn-type-defaults
-O2
"-with-rtsopts=-N4 -A64m -AL256m -I0"
exposed-modules: Data.Text.Fuzzy.Tokenize
, Data.Text.Fuzzy.Dates
, Data.Text.Fuzzy.Section
, Data.Text.Fuzzy.SExp
, Data.Text.Fuzzy.Attoparsec.Day
, Data.Text.Fuzzy.Attoparsec.Month
build-depends: base
, attoparsec
, containers
, mtl
, prettyprinter
, safe
, streaming
, scientific
, text
, time
, microlens-platform
, uniplate
, unliftio
, unordered-containers
, timeit
hs-source-dirs: src
executable fuzzy-sexp-parse
import: shared-properties
default-language: GHC2021
ghc-options:
-Wall
-fno-warn-type-defaults
-O2
main-is: FuzzySexpParse.hs
hs-source-dirs: misc
build-depends: base, fuzzy-parse
, containers
, hspec
, hspec-discover
, interpolatedstring-perl6
, text
, mtl
, streaming
, transformers
, exceptions
, uniplate
, microlens-platform
, safe
, timeit
, prettyprinter
test-suite fuzzy-parse-test
import: shared-properties
default-language: GHC2021
type: exitcode-stdio-1.0
main-is: Spec.hs
other-modules: FuzzyParseSpec
hs-source-dirs: test
build-depends: base, fuzzy-parse
, containers
, hspec
, hspec-discover
, interpolatedstring-perl6
, text
, mtl
, streaming
, transformers
, exceptions
, uniplate
, microlens-platform
, safe
, timeit
build-tool-depends: hspec-discover:hspec-discover == 2.*

30
misc/FuzzySexpParse.hs Normal file

@ -0,0 +1,30 @@
module Main where
import Data.Text.Fuzzy.SExp
import Data.Text.IO qualified as IO
import Data.Text qualified as Text
import Data.Either
import System.TimeIt
import Control.Monad.Except
import Data.Functor
import Data.Function
import Data.Fixed
import Prettyprinter
import System.IO
main :: IO ()
main = do
s <- IO.getContents
(tt,toks) <- timeItT do
pure (tokenizeSexp s)
(pt,top) <- timeItT do
runExceptT (parseTop @() s) <&> either (error.show) id
print (vcat (fmap pretty top))
hPrint stderr $ pretty (Text.length s) <+> "chars, parsed in" <+> viaShow (realToFrac pt :: Fixed E6)

0
nix/derivations/.gitignore vendored Normal file

7
nix/pkgs.json Normal file

@ -0,0 +1,7 @@
{
"url": "https://github.com/NixOS/nixpkgs.git",
"rev": "5cf0de2485efeccc307692eedadbb2d9bfdc7013",
"date": "2020-06-04T17:30:37+08:00",
"sha256": "07axrr50nlmnvba5ja2ihzjwczi66znak57bhcz472w22w7m3sd1",
"fetchSubmodules": false
}

5
nix/pkgs.nix Normal file

@ -0,0 +1,5 @@
import ((import <nixpkgs> {}).fetchFromGitHub {
owner = "NixOS";
repo = "nixpkgs";
inherit (builtins.fromJSON (builtins.readFile ./pkgs.json)) rev sha256;
})

29
nix/release.nix Normal file

@ -0,0 +1,29 @@
let
pkgs = import ./pkgs.nix { inherit config;
};
lib = pkgs.haskell.lib;
config = {
packageOverrides = pkgs: rec {
haskellPackages = pkgs.haskellPackages.override { overrides = haskOverrides; };
};
};
gitignore = pkgs.callPackage (pkgs.fetchFromGitHub {
owner = "siers";
repo = "nix-gitignore";
rev = "ce0778ddd8b1f5f92d26480c21706b51b1af9166";
sha256 = "1d7ab78i2k13lffskb23x8b5h24x7wkdmpvmria1v3wb9pcpkg2w";
}) {};
ignore = gitignore.gitignoreSourceAux ''
.stack-work
dist
dist-newstyle
.ghc.environment*
'';
haskOverrides = new: old:
let overrides = lib.packagesFromDirectory { directory = ./derivations; } new old;
in overrides;
in rec {
inherit pkgs;
packages = { inherit (pkgs.haskellPackages) fuzzy-parse;
};
}

3
nix/update-nixpkgs.sh Executable file

@ -0,0 +1,3 @@
#!/usr/bin/env nix-shell
#! nix-shell -i bash -p nix-prefetch-git
nix-prefetch-git https://github.com/NixOS/nixpkgs.git | tee pkgs.json

10
scripts/upload-docs.sh Executable file

@ -0,0 +1,10 @@
#!/bin/sh
set -e
dir=$(mktemp -d dist-docs.XXXXXX)
trap 'rm -r "$dir"' EXIT
# assumes cabal 2.4 or later
cabal v2-haddock --builddir="$dir" --haddock-for-hackage --enable-doc
cabal upload -d --publish $dir/*-docs.tar.gz

97
src/Data/Text/Fuzzy/Attoparsec/Day.hs Normal file

@ -0,0 +1,97 @@
module Data.Text.Fuzzy.Attoparsec.Day ( dayDMY
, dayYMD
, dayYYYYMMDD
, dayDMonY
, day
) where
import Data.List (zipWith)
import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser,decimal,digit,count,satisfy,inClass,skipWhile)
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
import qualified Data.Char as Char
import qualified Data.Text as Text
day :: Parser Day
day = dayDMonY <|> dayYYYYMMDD <|> dayYMD <|> dayDMY
skipDelim :: Parser ()
skipDelim = skipWhile (inClass " ./-")
dayDMY :: Parser Day
dayDMY = do
d <- decimal :: Parser Int
skipDelim
m <- decimal :: Parser Int
skipDelim
y' <- decimal :: Parser Integer
maybe (fail "bad date format") pure (makeDay y' m d)
dayYMD :: Parser Day
dayYMD = do
y' <- decimal :: Parser Integer
skipDelim
m <- decimal :: Parser Int
skipDelim
d <- decimal :: Parser Int
maybe (fail "bad date format") pure (makeDay y' m d)
dayYYYYMMDD :: Parser Day
dayYYYYMMDD = do
y <- fromIntegral . num n4 . map o <$> count 4 digit
m <- num n2 . map o <$> count 2 digit
d <- num n2 . map o <$> count 2 digit
maybe (fail "bad date format") pure (makeDay y m d)
where n4 = [1000,100,10,1]
n2 = [10,1]
o x = Char.ord x - Char.ord '0'
num n x = sum $ zipWith (*) x n
dayDMonY :: Parser Day
dayDMonY = do
d <- decimal :: Parser Int
skipDelim
m <- pMon
skipDelim
y <- decimal :: Parser Integer
maybe (fail "bad date format") pure (makeDay y m d)
where
pMon :: Parser Int
pMon = do
txt <- Text.toUpper . Text.pack <$> count 3 (satisfy Char.isLetter)
case txt of
"JAN" -> pure 1
"FEB" -> pure 2
"MAR" -> pure 3
"APR" -> pure 4
"MAY" -> pure 5
"JUN" -> pure 6
"JUL" -> pure 7
"AUG" -> pure 8
"SEP" -> pure 9
"OCT" -> pure 10
"NOV" -> pure 11
"DEC" -> pure 12
_ -> fail "bad month name"
makeYear :: Integer -> Maybe Integer
makeYear y' = if y < 1900 && y' < 99
then Nothing
else pure y
where
y = if y' < 50
then y' + 2000
else (if y' >= 50 && y' <= 99
then y' + 1900
else y' )
makeDay :: Integer -> Int -> Int -> Maybe Day
makeDay y m d | m <= 12 && m > 0 =
makeYear y >>= \yyyy -> if d <= gregorianMonthLength yyyy m
then pure $ fromGregorian yyyy m d
else Nothing
| otherwise = Nothing

46
src/Data/Text/Fuzzy/Attoparsec/Month.hs Normal file

@ -0,0 +1,46 @@
module Data.Text.Fuzzy.Attoparsec.Month ( fuzzyMonth, fuzzyMonthFromText
) where
import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser,decimal,digit,letter,many1,parseOnly)
import Data.Map (Map)
import Data.Maybe
import Data.Text (Text)
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
import qualified Data.Char as Char
import qualified Data.Map as Map
import qualified Data.Text as Text
fuzzyMonth :: Parser Int
fuzzyMonth = pMonthNum <|> pMonth
fuzzyMonthFromText :: Text -> Maybe Int
fuzzyMonthFromText = either (const Nothing) Just . parseOnly fuzzyMonth
pMonthNum :: Parser Int
pMonthNum = do
n <- decimal
if n >= 1 && n <= 12
then pure n
else fail "invalid month number"
pMonth :: Parser Int
pMonth = do
mo <- many1 (Char.toLower <$> letter)
maybe (fail "invalid month name") pure (Map.lookup mo months)
where
months :: Map String Int
months = Map.fromList [ ("jan", 1), ("january" , 1)
, ("feb", 2), ("febuary" , 2)
, ("mar", 3), ("march" , 3)
, ("apr", 4), ("april" , 4)
, ("may", 5), ("may" , 5)
, ("jun", 6), ("june" , 6)
, ("jul", 7), ("july" , 7)
, ("aug", 8), ("august" , 8)
, ("sep", 9), ("september", 9)
, ("oct", 10), ("october" , 10)
, ("nov", 11), ("november" , 11)
, ("dec", 12), ("december" , 12)
]

46
src/Data/Text/Fuzzy/Dates.hs Normal file

@ -0,0 +1,46 @@
-- |
-- Module : Data.Text.Fuzzy.Dates
-- Copyright : Dmitry Zuikov 2020
-- License : MIT
--
-- Maintainer : dzuikov@gmail.com
-- Stability : experimental
-- Portability : unknown
--
-- Fuzzy date parsing.
-- Supports a number of date formats and tries to recover
-- incomplete dates from text, with the use of some
-- reasonable assumptions. Does not support locales,
-- i.e. it assumes English month names only for now.
--
-- == Examples
--
-- > parseMaybeDay "01.01.1979"
-- > Just 1979-01-01
-- > parseMaybeDay "01.01.01"
-- > Just 2001-01-01
-- > parseMaybeDay "13/01/2019"
-- > Just 2019-01-13
-- > parseMaybeDay "2019-12-1"
-- > Just 2019-12-01
-- > parseMaybeDay "21-feb-79"
-- > Just 1979-02-21
-- > parseMaybeDay "21-feb-01"
-- > Just 2001-02-21
-- > parseMaybeDay "29feb04"
-- > Just 2004-02-29
-- > parseMaybeDay "21feb28"
-- > Just 2028-02-21
module Data.Text.Fuzzy.Dates where
import Data.Attoparsec.Text (parseOnly)
import Data.Either (either)
import Data.Text.Fuzzy.Attoparsec.Day
import Data.Text (Text)
import Data.Time.Calendar
-- | Tries to parse a date from the text.
parseMaybeDay :: Text -> Maybe Day
parseMaybeDay s = either (const Nothing) pure (parseOnly day s)

378
src/Data/Text/Fuzzy/SExp.hs Normal file

@ -0,0 +1,378 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ExtendedDefaultRules #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ImportQualifiedPost #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE TemplateHaskell #-}
module Data.Text.Fuzzy.SExp where
import Data.Text (Text)
import Control.Applicative
import Control.Monad
import Data.Function
import Data.Functor
import Data.Text.Fuzzy.Tokenize
import Control.Monad.Reader
import Data.Typeable
import Control.Monad.Except
import Control.Monad.RWS
import Data.Maybe
import Data.Char (isSpace,digitToInt)
import Data.Generics.Uniplate.Data()
import Safe
import Data.Data
import GHC.Generics
import Lens.Micro.Platform
import Data.Text qualified as Text
import Data.Coerce
import Data.Scientific
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HM
import Prettyprinter hiding (braces,list)
import Streaming.Prelude qualified as S
data TTok = TChar Char
| TSChar Char
| TPunct Char
| TText Text
| TStrLit Text
| TKeyword Text
| TEmpty
| TIndent Int
deriving stock (Eq,Ord,Show,Data,Generic)
instance IsToken TTok where
mkChar = TChar
mkSChar = TSChar
mkPunct = TPunct
mkText = TText
mkStrLit = TStrLit
mkKeyword = TKeyword
mkEmpty = TEmpty
mkIndent = TIndent
newtype C0 = C0 (Maybe Int)
deriving stock (Eq,Ord,Show,Data,Typeable,Generic)
data SExpParseError =
ParensOver C0
| ParensUnder C0
| ParensUnmatched C0
| SyntaxError C0
deriving stock (Show,Typeable)
data NumType =
NumInteger Integer
| NumDouble Scientific
deriving stock (Eq,Ord,Show,Data,Generic)
class Monoid c => ForMicroSexp c where
instance Monoid C0 where
mempty = C0 Nothing
instance Semigroup C0 where
(<>) (C0 a) (C0 b) = C0 (b <|> a)
instance ForMicroSexp C0 where
instance ForMicroSexp () where
data MicroSexp c =
List_ c [MicroSexp c]
| Symbol_ c Text
| String_ c Text
| Number_ c NumType
| Boolean_ c Bool
deriving stock (Show,Data,Generic)
pattern List :: ForMicroSexp c => [MicroSexp c] -> MicroSexp c
pattern List xs <- List_ _ xs where
List xs = List_ mempty xs
pattern Symbol :: ForMicroSexp c => Text -> MicroSexp c
pattern Symbol xs <- Symbol_ _ xs where
Symbol xs = Symbol_ mempty xs
pattern String :: ForMicroSexp c => Text -> MicroSexp c
pattern String x <- String_ _ x where
String x = String_ mempty x
pattern Number :: ForMicroSexp c => NumType -> MicroSexp c
pattern Number n <- Number_ _ n where
Number n = Number_ mempty n
pattern Boolean :: ForMicroSexp c => Bool -> MicroSexp c
pattern Boolean b <- Boolean_ _ b where
Boolean b = Boolean_ mempty b
{-# COMPLETE List, Symbol, String, Number, Boolean #-}
contextOf :: Lens (MicroSexp c) (MicroSexp c) c c
contextOf = lens g s
where
s sexp c = case sexp of
List_ _ a -> List_ c a
Symbol_ _ a -> Symbol_ c a
String_ _ a -> String_ c a
Number_ _ a -> Number_ c a
Boolean_ _ a -> Boolean_ c a
g = \case
List_ c _ -> c
Symbol_ c _ -> c
String_ c _ -> c
Number_ c _ -> c
Boolean_ c _ -> c
nil :: forall c . ForMicroSexp c => MicroSexp c
nil = List []
symbol :: forall c . ForMicroSexp c => Text -> MicroSexp c
symbol = Symbol
str :: forall c . ForMicroSexp c => Text -> MicroSexp c
str = String
newtype SExpEnv =
SExpEnv
{ sexpTranslate :: Bool
}
data SExpState =
SExpState
{ _sexpLno :: Int
, _sexpBraces :: [Char]
}
makeLenses 'SExpState
defEnv :: SExpEnv
defEnv = SExpEnv True
newtype SExpM m a = SExpM { fromSexpM :: RWST SExpEnv () SExpState m a }
deriving newtype
( Applicative
, Functor
, Monad
, MonadState SExpState
, MonadReader SExpEnv
, MonadTrans
)
instance MonadError SExpParseError m => MonadError SExpParseError (SExpM m) where
throwError = lift . throwError
catchError w = catchError (coerce $ fromSexpM w)
tokenizeSexp :: Text -> [TTok]
tokenizeSexp txt = do
let spec = delims " \r\t" <> comment ";"
<> punct "'{}()[]\n"
<> sqq
<> uw
tokenize spec txt
runSexpM :: Monad m => SExpM m a -> m a
runSexpM f = evalRWST (fromSexpM f) defEnv (SExpState 0 []) <&> fst
parseSexp :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m (MicroSexp c)
parseSexp txt = do
(s, _) <- runSexpM do
(s,rest) <- sexp (tokenizeSexp txt)
checkBraces
pure (s,rest)
pure s
checkBraces :: (MonadError SExpParseError m) => SExpM m ()
checkBraces = do
braces <- gets (view sexpBraces)
unless (null braces) $ raiseWith ParensUnder
succLno :: (MonadError SExpParseError m) => SExpM m ()
succLno = modify (over sexpLno succ)
parseTop :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m [MicroSexp c]
parseTop txt = do
let tokens = tokenizeSexp txt
S.toList_ $ runSexpM do
flip fix (mempty,tokens) $ \next -> \case
(acc, []) -> do
emit acc
(acc, TPunct '\n' : rest) -> do
succLno
emit acc
next (mempty,rest)
(acc, rest) -> do
(s, xs) <- sexp rest
next (acc <> [s],xs)
where
emit [] = pure ()
emit wtf = case wtf of
[List one] -> lift $ S.yield (List one)
xs -> lift $ S.yield (List xs)
sexp :: (ForMicroSexp c, MonadError SExpParseError m) => [TTok] -> SExpM m (MicroSexp c, [TTok])
sexp s = case s of
[] -> do
checkBraces
pure (nil, mempty)
(TText l : w) -> (,w) <$> trNum (Symbol l)
(TStrLit l : w) -> pure (String l, w)
-- so far ignored
(TPunct '\'' : rest) -> sexp rest
(TPunct '\n' : rest) -> succLno >> sexp rest
(TPunct c : rest) | isSpace c -> sexp rest
(TPunct c : rest) | isBrace c ->
maybe (pure (nil, rest)) (`list` rest) (closing c)
| otherwise -> do
raiseWith ParensOver
( _ : _ ) -> raiseWith SyntaxError
where
setContext w = do
co <- getC0
pure $ over _2 (set contextOf co) w
isBrace :: Char -> Bool
isBrace c = HM.member c braces
closing :: Char -> Maybe Char
closing c = HM.lookup c braces
braces :: HashMap Char Char
braces = HM.fromList[ ('{', '}')
, ('(', ')')
, ('[', ']')
, ('<', '>')
]
cBraces :: [Char]
cBraces = HM.elems braces
trNum tok = do
trans <- asks sexpTranslate
case tok of
Symbol s | trans -> do
let s0 = Text.unpack s
let what = Number . NumInteger <$> readMay @Integer s0
<|>
Number . NumInteger <$> parseBinary s0
<|>
Number . NumDouble <$> readMay @Scientific s0
<|>
( case s of
"#t" -> Just (Boolean True)
"#f" -> Just (Boolean False)
_ -> Nothing
)
pure $ fromMaybe (Symbol s) what
x -> pure x
{-# INLINE trNum #-}
list :: (ForMicroSexp c, MonadError SExpParseError m)
=> Char
-> [TTok]
-> SExpM m (MicroSexp c, [TTok])
list _ [] = raiseWith ParensUnder
list cb tokens = do
modify $ over sexpBraces (cb:)
go cb mempty tokens
where
isClosingFor :: Char -> Bool
isClosingFor c = c `elem` cBraces
go _ _ [] = do
checkBraces
pure (List mempty, mempty)
go cl acc (TPunct c : rest) | isSpace c = do
go cl acc rest
go cl acc (TPunct c : rest)
| isClosingFor c && c == cl = do
modify $ over sexpBraces (drop 1)
pure (List (reverse acc), rest)
| isClosingFor c && c /= cl = do
raiseWith ParensUnmatched
-- throwError =<< ParensUnmatched <$> undefined
go cl acc rest = do
(e,r) <- sexp rest
go cl (e : acc) r
getC0 :: Monad m => SExpM m C0
getC0 = do
lno <- gets (view sexpLno)
pure (C0 (Just lno))
raiseWith :: (MonadError SExpParseError m)
=> (C0 -> SExpParseError) -> SExpM m b
raiseWith a = throwError =<< a <$> getC0
instance Pretty NumType where
pretty = \case
NumInteger n -> pretty n
NumDouble n -> viaShow n
instance ForMicroSexp c => Pretty (MicroSexp c) where
pretty = \case
List xs -> parens (hsep (fmap pretty xs))
String s -> dquotes (pretty s)
Symbol s -> pretty s
Number n -> pretty n
Boolean True -> pretty "#t"
Boolean False -> pretty "#f"
isBinaryDigit :: Char -> Bool
isBinaryDigit c = c == '0' || c == '1'
parseBinary :: String -> Maybe Integer
parseBinary str =
let
withoutPrefix = case str of
'0':'b':rest -> Just rest
'0':'B':rest -> Just rest
_ -> Nothing
in if isJust withoutPrefix && all isBinaryDigit (fromJust withoutPrefix)
then Just $ foldl (\acc x -> acc * 2 + toInteger (digitToInt x)) 0 (fromJust withoutPrefix)
else Nothing

15
src/Data/Text/Fuzzy/Section.hs Normal file

@ -0,0 +1,15 @@
module Data.Text.Fuzzy.Section (cutSectionBy, cutSectionOn) where
import Data.Text (Text)
import qualified Data.List as List
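-- | Cuts out the block of lines starting at the first line equal to the first
--   marker and ending just before the first line equal to the second marker.
--   An illustrative sketch (the markers here are hypothetical):
--
-- >>> cutSectionOn "[begin]" "[end]" ["x", "[begin]", "a", "b", "[end]", "y"]
-- ["[begin]","a","b"]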
cutSectionOn :: Text -> Text -> [Text] -> [Text]
cutSectionOn a b txt = cutSectionBy (== a) (== b) txt
cutSectionBy :: (Text -> Bool) -> (Text -> Bool) -> [Text] -> [Text]
cutSectionBy a b txt = cutI
where
cutC = List.dropWhile (not . a) txt
cutI = List.takeWhile (not . b) cutC

556
src/Data/Text/Fuzzy/Tokenize.hs Normal file

@ -0,0 +1,556 @@
-- |
-- Module : Data.Text.Fuzzy.Tokenize
-- Copyright : Dmitry Zuikov 2020
-- License : MIT
--
-- Maintainer : dzuikov@gmail.com
-- Stability : experimental
-- Portability : unknown
--
-- The lightweight and multi-functional text tokenizer, allowing different types of text tokenization
-- depending on its settings.
--
-- It may be used in different situations: for DSLs, text markup or even for parsing simple grammars,
-- more easily and sometimes faster than with mainstream parser combinators or parser
-- generators.
--
-- The primary goal of this package is to parse unstructured text data, however it may be used for
-- parsing such data formats as CSV with ease.
--
-- Currently it supports the following types of entities: atoms, string literals (currently with a
-- minimal set of escaped characters), punctuation characters and delimiters.
--
-- == Examples
-- === Simple CSV-like tokenization
-- >>> tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq "]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq ","","","",""]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
-- >>> tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
-- >>> tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
--
-- == Notes
--
-- === About the delimiter tokens
-- Delimiter tokens appear while processing "delimited"
-- formats and disappear from the results. Currently
-- you will never see them unless normalization is turned off with the 'nn' option.
--
-- Delimiters make sense when processing CSV-like formats,
-- but in that case you probably need only the values in the results.
--
-- This behavior may change later. But right now delimiter tokens seem pointless
-- in the results. If you process some sort of grammar where the delimiter character
-- is important, you may use punctuation instead, e.g.:
--
-- >>> let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
-- >>> tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
-- ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
--
-- == Other
-- For CSV-like formats it makes sense to split the text into lines first,
-- otherwise newline characters may lead to weird results.
--
--
module Data.Text.Fuzzy.Tokenize ( TokenizeSpec
, IsToken(..)
, tokenize
, esc
, addEmptyFields
, emptyFields
, nn
, sq
, sqq
, noslits
, sl
, sr
, uw
, delims
, comment
, punct
, indent
, itabstops
, keywords
, eol
) where
import Prelude hiding (init)
import Control.Applicative
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import Data.Monoid()
import Data.Set (Set)
import Data.Text (Text)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import Control.Monad (when)
import Control.Monad.RWS
-- | Tokenization settings. Use mempty for an empty value
-- and construction functions for changing the settings.
--
data TokenizeSpec = TokenizeSpec { tsAtoms :: Set Text
, tsStringQQ :: Maybe Bool
, tsStringQ :: Maybe Bool
, tsNoSlits :: Maybe Bool
, tsLineComment :: Map Char Text
, tsDelims :: Set Char
, tsEol :: Maybe Bool
, tsStripLeft :: Maybe Bool
, tsStripRight :: Maybe Bool
, tsUW :: Maybe Bool
, tsNotNormalize :: Maybe Bool
, tsEsc :: Maybe Bool
, tsAddEmptyFields :: Maybe Bool
, tsPunct :: Set Char
, tsIndent :: Maybe Bool
, tsItabStops :: Maybe Int
, tsKeywords :: Set Text
}
deriving (Eq,Ord,Show)
instance Semigroup TokenizeSpec where
(<>) a b = TokenizeSpec { tsAtoms = tsAtoms b <> tsAtoms a
, tsStringQQ = tsStringQQ b <|> tsStringQQ a
, tsStringQ = tsStringQ b <|> tsStringQ a
, tsNoSlits = tsNoSlits b <|> tsNoSlits a
, tsLineComment = tsLineComment b <> tsLineComment a
, tsDelims = tsDelims b <> tsDelims a
, tsEol = tsEol b <|> tsEol a
, tsStripLeft = tsStripLeft b <|> tsStripLeft a
, tsStripRight = tsStripRight b <|> tsStripRight a
, tsUW = tsUW b <|> tsUW a
, tsNotNormalize = tsNotNormalize b <|> tsNotNormalize a
, tsEsc = tsEsc b <|> tsEsc a
, tsAddEmptyFields = tsAddEmptyFields b <|> tsAddEmptyFields a
, tsPunct = tsPunct b <> tsPunct a
, tsIndent = tsIndent b <|> tsIndent a
, tsItabStops = tsItabStops b <|> tsItabStops a
, tsKeywords = tsKeywords b <> tsKeywords a
}
instance Monoid TokenizeSpec where
mempty = TokenizeSpec { tsAtoms = mempty
, tsStringQQ = Nothing
, tsStringQ = Nothing
, tsNoSlits = Nothing
, tsLineComment = mempty
, tsDelims = mempty
, tsEol = Nothing
, tsStripLeft = Nothing
, tsStripRight = Nothing
, tsUW = Nothing
, tsNotNormalize = Nothing
, tsEsc = Nothing
, tsAddEmptyFields = Nothing
, tsPunct = mempty
, tsIndent = Nothing
, tsItabStops = Nothing
, tsKeywords = mempty
}
justTrue :: Maybe Bool -> Bool
justTrue (Just True) = True
justTrue _ = False
-- | Turns on EOL token generation
eol :: TokenizeSpec
eol = mempty { tsEol = pure True }
-- | Turn on character escaping inside string literals.
-- Currently the following escaped characters are
-- supported: [" ' \ t n r a b f v ]
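--
-- An illustrative sketch, escaping a single quote inside a single-quoted literal:
--
-- >>> tokenize (sq <> esc) "'it\\'s'" :: [Text]
-- ["it's"]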
esc :: TokenizeSpec
esc = mempty { tsEsc = pure True }
-- | Raise empty field tokens (see the 'mkEmpty' method)
-- when no tokens are found before a delimiter.
-- Useful for processing CSV-like data in
-- order to distinguish empty columns.
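--
-- An illustrative sketch: empty columns show up as empty-field tokens.
--
-- >>> tokenize (delims ":" <> addEmptyFields) "a::b:" :: [Maybe Text]
-- [Just "a",Nothing,Just "b",Nothing]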
addEmptyFields :: TokenizeSpec
addEmptyFields = mempty { tsAddEmptyFields = pure True }
-- | same as addEmptyFields
emptyFields :: TokenizeSpec
emptyFields = addEmptyFields
-- | Turns off token normalization. Makes the tokenizer
-- generate a raw character stream. Useful for debugging.
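--
-- An illustrative sketch: with normalization off, raw character tokens come back.
--
-- >>> tokenize nn "ab c" :: [Text]
-- ["a","b"," ","c"]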
nn :: TokenizeSpec
nn = mempty { tsNotNormalize = pure True }
-- | Turns on single-quoted string literals.
-- Character stream after '\'' character
-- will be processed as a single-quoted stream,
-- treating all delimiter, comment and other special
-- characters as a part of the string literal until
-- the next unescaped single quote character.
sq :: TokenizeSpec
sq = mempty { tsStringQ = pure True }
-- | Enable double-quoted string literals support
-- as 'sq' for single-quoted strings.
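--
-- An illustrative sketch:
--
-- >>> tokenize (delims " " <> sqq) "say \"hello world\" now" :: [Text]
-- ["say","hello world","now"]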
sqq :: TokenizeSpec
sqq = mempty { tsStringQQ = pure True }
-- | Disable separate string literals.
--
-- Useful when processing delimited data (CSV-like formats).
-- Normally, sequential text chunks are concatenated together,
-- but consecutive text and a string literal will produce two
-- different tokens, which may cause weird results if the data
-- is in a CSV-like format, e.g.:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq ) "aaa:bebe:'qq' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq",Just " aaa",Just "next",Nothing,Nothing]
--
-- look: "qq" and " aaa" are turned into two separate tokens that makes the result
-- of CSV processing looks improper, like it has an extra-column. This behavior may be
-- avoided using this option, if you don't need to distinguish text chunks and string
-- literals:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq<>noslits) "aaa:bebe:'qq:foo' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq:foo aaa",Just "next",Nothing,Nothing]
--
noslits :: TokenizeSpec
noslits = mempty { tsNoSlits = pure True }
-- | Specify the list of delimiters (characters)
-- used to split the character stream into fields. Useful for CSV-like separated formats. Support for
-- empty fields in the token stream may be enabled with the 'addEmptyFields' function.
delims :: String -> TokenizeSpec
delims s = mempty { tsDelims = Set.fromList s }
-- | Strip spaces on left side of a token.
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces.
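--
-- An illustrative sketch:
--
-- >>> tokenize (sl <> delims ":") " a : b" :: [Text]
-- ["a ","b"]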
sl :: TokenizeSpec
sl = mempty { tsStripLeft = pure True }
-- | Strip spaces on right side of a token.
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces.
sr :: TokenizeSpec
sr = mempty { tsStripRight = pure True }
-- | Strips spaces on the right and left sides and collapses multiple spaces into one.
-- The name comes from unwords . words.
--
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces.
uw :: TokenizeSpec
uw = mempty { tsUW = pure True }
-- | Specify the line comment prefix.
-- All text after the line comment prefix will
-- be ignored until a newline character appears.
-- Multiple line comment prefixes are supported.
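--
-- An illustrative sketch (the "--" prefix here is just an example):
--
-- >>> tokenize (delims " " <> comment "--") "foo bar -- ignored" :: [Text]
-- ["foo","bar"]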
comment :: Text -> TokenizeSpec
comment s = mempty { tsLineComment = cmt }
where
cmt = case Text.uncons s of
Just (p,su) -> Map.singleton p su
Nothing -> mempty
-- | Specify the punctuation characters.
-- Any punctuation character is handled as a separate
-- token.
-- Any token will be broken on a punctuation character.
--
-- Useful for handling ... er... punctuation, like
--
-- >> function(a,b)
--
-- or
--
-- >> (apply function 1 2 3)
--
--
-- >>> tokenize spec "(apply function 1 2 3)" :: [Text]
-- ["(","apply","function","1","2","3",")"]
--
punct :: Text -> TokenizeSpec
punct s = mempty { tsPunct = Set.fromList (Text.unpack s) }
-- | Specify the keywords list.
-- Each keyword will be treated as a separate token.
keywords :: [Text] -> TokenizeSpec
keywords s = mempty { tsKeywords = Set.fromList s }
-- | Enable indentation support
indent :: TokenizeSpec
indent = mempty { tsIndent = Just True }
-- | Set the tab expansion multiplier,
-- i.e. each tab expands into n spaces before processing.
-- It also turns on the indentation. Only the tabs at the beginning of the string are expanded,
-- i.e. before the first non-space character appears.
itabstops :: Int -> TokenizeSpec
itabstops n = mempty { tsIndent = Just True, tsItabStops = pure n }
newtype TokenizeM w a = TokenizeM (RWS TokenizeSpec w () a)
deriving( Applicative
, Functor
, MonadReader TokenizeSpec
, MonadWriter w
, MonadState ()
, Monad
)
data Token = TChar Char
| TSChar Char
| TPunct Char
| TText Text
| TSLit Text
| TKeyword Text
| TEmpty
| TDelim
| TIndent Int
| TEol
deriving (Eq,Ord,Show)
-- | Typeclass for token values.
-- Note that some tokens appear in the results
-- only when the 'nn' option is set: normally, sequences
-- of characters are turned into text tokens or string literals,
-- and delimiter tokens are just removed from the
-- results.
class IsToken a where
-- | Create a character token
mkChar :: Char -> a
-- | Create a string literal character token
mkSChar :: Char -> a
-- | Create a punctuation token
mkPunct :: Char -> a
-- | Create a text chunk token
mkText :: Text -> a
-- | Create a string literal token
mkStrLit :: Text -> a
-- | Create a keyword token
mkKeyword :: Text -> a
-- | Create an empty field token
mkEmpty :: a
-- | Create a delimiter token
mkDelim :: a
mkDelim = mkEmpty
-- | Creates an indent token
mkIndent :: Int -> a
mkIndent = const mkEmpty
-- | Creates an EOL token
mkEol :: a
mkEol = mkEmpty
instance IsToken (Maybe Text) where
mkChar = pure . Text.singleton
mkSChar = pure . Text.singleton
mkPunct = pure . Text.singleton
mkText = pure
mkStrLit = pure
mkKeyword = pure
mkEmpty = Nothing
instance IsToken Text where
mkChar = Text.singleton
mkSChar = Text.singleton
mkPunct = Text.singleton
mkText = id
mkStrLit = id
mkKeyword = id
mkEmpty = ""
-- | Tokenize a text
tokenize :: IsToken a => TokenizeSpec -> Text -> [a]
tokenize s t = map tr t1
where
t1 = tokenize' s t
tr (TChar c) = mkChar c
tr (TSChar c) = mkSChar c
tr (TText c) = mkText c
tr (TSLit c) = mkStrLit c
tr (TKeyword c) = mkKeyword c
tr TEmpty = mkEmpty
tr (TPunct c) = mkPunct c
tr TDelim = mkDelim
tr (TIndent n) = mkIndent n
tr TEol = mkEol
execTokenizeM :: TokenizeM [Token] a -> TokenizeSpec -> [Token]
execTokenizeM (TokenizeM m) spec =
let (_,w) = execRWS m spec () in norm w
where norm x | justTrue (tsNotNormalize spec) = x
| otherwise = normalize spec x
tokenize' :: TokenizeSpec -> Text -> [Token]
tokenize' spec txt = execTokenizeM (root' txt) spec
where
r = spec
noIndent = not doIndent
doIndent = justTrue (tsIndent r)
eolOk = justTrue (tsEol r)
root' x = scanIndent x >>= root
root ts = do
case Text.uncons ts of
Nothing -> pure ()
Just ('\n', rest) | doIndent -> raiseEol >> root' rest
Just (c, rest) | Set.member c (tsDelims r) -> tell [TDelim] >> root rest
Just ('\'', rest) | justTrue (tsStringQ r) -> scanQ '\'' rest
Just ('"', rest) | justTrue (tsStringQQ r) -> scanQ '"' rest
Just (c, rest) | Map.member c (tsLineComment r) -> scanComment (c,rest)
Just (c, rest) | Set.member c (tsPunct r) -> tell [TPunct c] >> root rest
Just (c, rest) | otherwise -> tell [TChar c] >> root rest
raiseEol | eolOk = tell [TEol]
| otherwise = pure ()
expandSpace ' ' = 1
expandSpace '\t' = (fromMaybe 8 (tsItabStops r))
expandSpace _ = 0
scanIndent x | noIndent = pure x
| otherwise = do
let (ss,as) = Text.span (\c -> c == ' ' || c == '\t') x
tell [ TIndent (sum (map expandSpace (Text.unpack ss))) ]
pure as
scanComment (c,rest) = do
suff <- Map.lookup c <$> asks tsLineComment
case suff of
Just t | Text.isPrefixOf t rest -> do
root $ Text.dropWhile ('\n' /=) rest
_ -> tell [TChar c] >> root rest
scanQ q ts = do
case Text.uncons ts of
Nothing -> root ts
Just ('\\', rest) | justTrue (tsEsc r) -> unesc (scanQ q) rest
| otherwise -> tell [tsChar '\\'] >> scanQ q rest
Just (c, rest) | c == q -> root rest
| otherwise -> tell [tsChar c] >> scanQ q rest
unesc f ts =
case Text.uncons ts of
Nothing -> f ts
Just ('"', rs) -> tell [tsChar '"' ] >> f rs
Just ('\'', rs) -> tell [tsChar '\''] >> f rs
Just ('\\', rs) -> tell [tsChar '\\'] >> f rs
Just ('t', rs) -> tell [tsChar '\t'] >> f rs
Just ('n', rs) -> tell [tsChar '\n'] >> f rs
Just ('r', rs) -> tell [tsChar '\r'] >> f rs
Just ('a', rs) -> tell [tsChar '\a'] >> f rs
Just ('b', rs) -> tell [tsChar '\b'] >> f rs
Just ('f', rs) -> tell [tsChar '\f'] >> f rs
Just ('v', rs) -> tell [tsChar '\v'] >> f rs
Just (_, rs) -> f rs
tsChar c | justTrue (tsNoSlits spec) = TChar c
| otherwise = TSChar c
newtype NormStats = NormStats { nstatBeforeDelim :: Int }
normalize :: TokenizeSpec -> [Token] -> [Token]
normalize spec tokens = snd $ execRWS (go tokens) () init
where
go [] = addEmptyField
go s@(TIndent _ : _) = do
let (iis, rest') = List.span isIndent s
tell [TIndent (sum [k | TIndent k <- iis])]
go rest'
go (TChar c0 : cs) = do
let (n,ns) = List.span isTChar cs
succStat
let chunk = eatSpaces $ Text.pack (c0 : [ c | TChar c <- n])
let kw = Set.member chunk (tsKeywords spec)
tell [ if kw then TKeyword chunk else TText chunk ]
go ns
go (TSChar x : xs) = do
let (n,ns) = List.span isTSChar xs
succStat
tell [ TSLit $ Text.pack (x : [ c | TSChar c <- n]) ]
go ns
go (TDelim : xs) = do
addEmptyField
pruneStat
go xs
go (TPunct c : xs) = do
tell [ TPunct c ]
succStat
go xs
go (x:xs) = tell [x] >> go xs
succStat = do
modify (\x -> x { nstatBeforeDelim = succ (nstatBeforeDelim x)})
pruneStat = do
modify (\x -> x { nstatBeforeDelim = 0 } )
addEmptyField = do
ns <- gets nstatBeforeDelim
when (ns == 0 && justTrue (tsAddEmptyFields spec) ) $ do
tell [ TEmpty ]
isTChar (TChar _) = True
isTChar _ = False
isTSChar (TSChar _) = True
isTSChar _ = False
isIndent (TIndent _) = True
isIndent _ = False
init = NormStats { nstatBeforeDelim = 0 }
eatSpaces s | sboth = Text.strip s
| sLonly = Text.stripStart s
| sRonly = Text.stripEnd s
| sWU = (Text.unwords . Text.words) s
| otherwise = s
where sboth = justTrue (tsStripLeft spec) && justTrue (tsStripRight spec)
sLonly = justTrue (tsStripLeft spec) && not (justTrue (tsStripRight spec))
sRonly = not (justTrue (tsStripLeft spec)) && justTrue (tsStripRight spec)
sWU = justTrue (tsUW spec)

1
src/Data/Text/LSH.hs Normal file

@ -0,0 +1 @@
module Data.Text.LSH where

150
test/FuzzyParseSpec.hs Normal file

@ -0,0 +1,150 @@
{-# LANGUAGE OverloadedStrings
, QuasiQuotes
, ExtendedDefaultRules
, LambdaCase
, ImportQualifiedPost
, DerivingStrategies
, PatternSynonyms
, ViewPatterns
, MultiWayIf
, TemplateHaskell
#-}
module FuzzyParseSpec (spec) where
import Data.Text (Text)
import Test.Hspec
import Text.InterpolatedString.Perl6 (q)
import Data.Text.Fuzzy.Tokenize
import Data.Data
import Data.Generics.Uniplate.Data()
import GHC.Generics
data TTok = TChar Char
| TSChar Char
| TPunct Char
| TText Text
| TStrLit Text
| TKeyword Text
| TEmpty
| TIndent Int
deriving stock (Eq,Ord,Show,Data,Generic)
instance IsToken TTok where
mkChar = TChar
mkSChar = TSChar
mkPunct = TPunct
mkText = TText
mkStrLit = TStrLit
mkKeyword = TKeyword
mkEmpty = TEmpty
mkIndent = TIndent
spec :: Spec
spec = do
describe "csv-like" $ do
it "splits text using ':' delimeter" $ do
let toks = tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
toks `shouldBe` ["aaa "," bebeb "," qqq "]
it "splits text using ':' delimeter with single-quotes string and empty fields" $ do
let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
toks `shouldBe` ["aaa "," bebeb "," qqq ","","","",""]
it "splits text using ':' delimeter with single-quotes string and empty fields" $ do
let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
toks `shouldBe` [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
it "splits text using ':' delimeter with single-quotes string and empty fields with noslits" $ do
let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
let toks = tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
toks `shouldBe` [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
it "splits text using ':' delimeter with single-quotes string and empty fields with noslits and uw" $ do
let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
let toks = tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
toks `shouldBe` [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
it "uses punctuation tokens" $ do
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
let toks = tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
toks `shouldBe` ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
it "tokenize simple lisp-like text with keywords" $ do
let spec = delims " \n\t" <> comment ";"
<> punct "{}()[]<>"
<> sq <> sqq
<> uw
<> keywords ["define","apply","+"]
let code = [q|
(define add (a b ) ; define simple function
(+ a b) )
(define r (add 10 20))
|]
let toks = tokenize spec code :: [TTok]
let expected = [ TPunct '('
, TKeyword "define"
, TText "add" , TPunct '(', TText "a" , TText "b", TPunct ')'
, TPunct '(', TKeyword "+", TText "a",TText "b",TPunct ')',TPunct ')'
, TPunct '(',TKeyword "define"
,TText "r"
,TPunct '(',TText "add",TText "10",TText "20"
,TPunct ')',TPunct ')']
toks `shouldBe` expected
describe "Checks indentation support" $ do
let spec = delims " \n\t" <> comment ";"
<> punct "{}()[]<>"
<> sq <> sqq
<> uw
<> indent
<> itabstops 8
<> keywords ["define"]
it "parses some indented blocks" $ do
let expected = [ TIndent 0, TKeyword "define", TText "a", TText "0"
, TIndent 2, TText "atom", TText "foo", TText "2"
, TIndent 2, TKeyword "define", TText "aq", TText "2"
, TIndent 4, TText "atom", TText "one", TText "4"
, TIndent 4, TText "atom", TText "two", TText "4"
, TIndent 0, TKeyword "define", TText "b", TText "0"
, TIndent 2, TText "atom", TText "baar", TText "2"
, TIndent 2, TText "atom", TText "quux", TText "2"
, TIndent 2, TKeyword "define", TText "new", TText "2"
, TIndent 6, TText "atom", TText "bar", TText "6"
, TIndent 4, TText "atom", TText "fuu", TText "4"
, TIndent 0
]
let pyLike = [q|
define a 0
atom foo 2
define aq 2
atom one 4
atom two 4
define b 0
atom baar 2
atom quux 2
define new 2
atom bar 6
atom fuu 4
|]
let toks = tokenize spec pyLike :: [TTok]
toks `shouldBe` expected

1
test/Spec.hs Normal file

@ -0,0 +1 @@
{-# OPTIONS_GHC -F -pgmF hspec-discover #-}