From cf85d2df2af00bf8c8d59666aa16ee4a85a3ba20 Mon Sep 17 00:00:00 2001 From: voidlizard Date: Mon, 7 Oct 2024 05:06:03 +0300 Subject: [PATCH] Squashed 'miscellaneous/fuzzy-parse/' content from commit a834b152e git-subtree-dir: miscellaneous/fuzzy-parse git-subtree-split: a834b152e29d632c816eefe117036e5d9330bd03 --- .envrc | 5 + .gitignore | 6 + CHANGELOG.md | 10 + LICENSE | 20 + README.markdown | 131 ++++++ TODO | 12 + cabal.project | 6 + flake.lock | 61 +++ flake.nix | 69 +++ fuzzy-parse.cabal | 160 +++++++ misc/FuzzySexpParse.hs | 30 ++ nix/derivations/.gitignore | 0 nix/pkgs.json | 7 + nix/pkgs.nix | 5 + nix/release.nix | 29 ++ nix/update-nixpkgs.sh | 3 + scripts/upload-docs.sh | 10 + src/Data/Text/Fuzzy/Attoparsec/Day.hs | 97 +++++ src/Data/Text/Fuzzy/Attoparsec/Month.hs | 46 ++ src/Data/Text/Fuzzy/Dates.hs | 46 ++ src/Data/Text/Fuzzy/SExp.hs | 378 ++++++++++++++++ src/Data/Text/Fuzzy/Section.hs | 15 + src/Data/Text/Fuzzy/Tokenize.hs | 556 ++++++++++++++++++++++++ src/Data/Text/LSH.hs | 1 + test/FuzzyParseSpec.hs | 150 +++++++ test/Spec.hs | 1 + 26 files changed, 1854 insertions(+) create mode 100644 .envrc create mode 100644 .gitignore create mode 100644 CHANGELOG.md create mode 100644 LICENSE create mode 100644 README.markdown create mode 100644 TODO create mode 100644 cabal.project create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 fuzzy-parse.cabal create mode 100644 misc/FuzzySexpParse.hs create mode 100644 nix/derivations/.gitignore create mode 100644 nix/pkgs.json create mode 100644 nix/pkgs.nix create mode 100644 nix/release.nix create mode 100755 nix/update-nixpkgs.sh create mode 100755 scripts/upload-docs.sh create mode 100644 src/Data/Text/Fuzzy/Attoparsec/Day.hs create mode 100644 src/Data/Text/Fuzzy/Attoparsec/Month.hs create mode 100644 src/Data/Text/Fuzzy/Dates.hs create mode 100644 src/Data/Text/Fuzzy/SExp.hs create mode 100644 src/Data/Text/Fuzzy/Section.hs create mode 100644 src/Data/Text/Fuzzy/Tokenize.hs create mode 100644 src/Data/Text/LSH.hs create mode 100644 test/FuzzyParseSpec.hs create mode 100644 test/Spec.hs diff --git a/.envrc b/.envrc new file mode 100644 index 00000000..ce34212a --- /dev/null +++ b/.envrc @@ -0,0 +1,5 @@ +if [ -f .envrc.local ]; then + source_env .envrc.local +fi + +use flake diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..7a25a854 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +*.swp +dist-newstyle/ +Setup.hs + +.direnv +.hbs2-git/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..374e936c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# Revision history for fuzzy-parse + +## 0.1.2.0 + + - Techical release + - Added some missed things + +## 0.1.0.0 -- YYYY-mm-dd + +* First version. Released on an unsuspecting world. diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..232522e1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2019 Dmitry Zuikov + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.markdown b/README.markdown new file mode 100644 index 00000000..30eb7e38 --- /dev/null +++ b/README.markdown @@ -0,0 +1,131 @@ +# About + +# Data.Text.Fuzzy.Tokenize + +The lightweight and multi-functional text tokenizer allowing different types of text tokenization +depending on it's settings. + +It may be used in different sutiations, for DSL, text markups or even for parsing simple grammars +easier and sometimes faster than in case of usage mainstream parsing combinators or parser +generators. + +The primary goal of this package is to parse unstructured text data, however it may be used for +parsing such data formats as CSV with ease. + +Currently it supports the following types of entities: atoms, string literals (currently with the +minimal set of escaped characters), punctuation characters and delimeters. + +## Examples + +### Simple CSV-like tokenization + +```haskell +tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text] + +["aaa "," bebeb "," qqq "] +``` + +```haskell +tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text] + +["aaa "," bebeb "," qqq ","","","",""] +``` + +```haskell +tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text] + +[Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] +``` + +```haskell +tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + +[Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] +``` + +```haskell +let spec = sl<>delims ":"<>sq<>emptyFields<>noslits +tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + +[Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing] +``` + +```haskell +let spec = delims ":"<>sq<>emptyFields<>uw<>noslits +tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + +[Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing] +``` + +### Primitive lisp-like language + +```haskell +{-# LANGUAGE QuasiQuotes, ExtendedDefaultRules #-} + +import Text.InterpolatedString.Perl6 (q) +import Data.Text.Fuzzy.Tokenize + +data TTok = TChar Char + | TSChar Char + | TPunct Char + | TText Text + | TStrLit Text + | TKeyword Text + | TEmpty + deriving(Eq,Ord,Show) + +instance IsToken TTok where + mkChar = TChar + mkSChar = TSChar + mkPunct = TPunct + mkText = TText + mkStrLit = TStrLit + mkKeyword = TKeyword + mkEmpty = TEmpty + +main = do + + let spec = delims " \n\t" <> comment ";" + <> punct "{}()[]<>" + <> sq <> sqq + <> uw + <> keywords ["define","apply","+"] + let code = [q| + (define add (a b ) ; define simple function + (+ a b) ) + (define r (add 10 20)) +|] + let toks = tokenize spec code :: [TTok] + + print toks +``` + +## Notes + +### About the delimeter tokens +This type of tokens appears during a "delimited" formats processing and disappears in results. +Currenly you will never see it unless normalization is turned off by 'nn' option. 
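+
+For illustration, a minimal sketch of the non-normalized stream (expected output derived from the
+tokenizer code, not a captured run; with 'nn' every character becomes its own token and the
+delimiter position comes through as the empty token of the target type):
+
+```haskell
+tokenize (delims ":"<>nn) "ab:c" :: [Text]
+
+["a","b","","c"]
+```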
+ +The delimeters make sense in case of processing the CSV-like formats, but in this case you probably +need only values in results. + +This behavior may be changed later. But right now delimeters seem pointless in results. If you +process some sort of grammar where delimeter character is important, you may use punctuation +instead, i.e: + +```haskell +let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq +tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text] + +["(","delimeters",",","are",",","important",",","spaces are not",")",";"] +``` + +### Other +For CSV-like formats it makes sense to split text to lines first, otherwise newline characters may +cause to weird results + + +# Authors + +This library is written and maintained by Dmitry Zuikov, dzuikov@gmail.com + diff --git a/TODO b/TODO new file mode 100644 index 00000000..fc8195fe --- /dev/null +++ b/TODO @@ -0,0 +1,12 @@ + + +- [ ] TODO: Tests for Data.Text.Fuzzy.Section +- [ ] TODO: haddock for Data.Text.Fuzzy.Section + +- [~] TODO: Tests +- [+] TODO: Freeze dependencies versions +- [ ] TODO: Version number +- [+] TODO: Haddocks +- [ ] TODO: Tokenizer: Identation support +- [ ] TODO: Tokenizer: Block comments support + diff --git a/cabal.project b/cabal.project new file mode 100644 index 00000000..b6dfabf9 --- /dev/null +++ b/cabal.project @@ -0,0 +1,6 @@ +packages: *.cabal + +allow-newer: all + +tests: True + diff --git a/flake.lock b/flake.lock new file mode 100644 index 00000000..5aee32b5 --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "locked": { + "lastModified": 1644229661, + "narHash": "sha256-1YdnJAsNy69bpcjuoKdOYQX0YxZBiCYZo4Twxerqv7k=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "3cecb5b042f7f209c56ffd8371b2711a290ec797", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "haskell-flake-utils": { + "inputs": { + "flake-utils": "flake-utils" + }, + "locked": { + "lastModified": 1707809372, + "narHash": "sha256-wfTL9PlCSOqSSyU4eenFFI7pHrV21gba4GEILnI4nAU=", + "owner": "ivanovs-4", + "repo": "haskell-flake-utils", + "rev": "3cbdc5d6093e8b4464ae64097e0c8c61e4414ff2", + "type": "github" + }, + "original": { + "owner": "ivanovs-4", + "repo": "haskell-flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1727089097, + "narHash": "sha256-ZMHMThPsthhUREwDebXw7GX45bJnBCVbfnH1g5iuSPc=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "568bfef547c14ca438c56a0bece08b8bb2b71a9c", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "haskell-flake-utils": "haskell-flake-utils", + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 00000000..97657169 --- /dev/null +++ b/flake.nix @@ -0,0 +1,69 @@ +{ +description = "Haskell cabal package"; + +inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + haskell-flake-utils.url = "github:ivanovs-4/haskell-flake-utils"; + + # another-simple-haskell-flake.url = "something"; + + # some-cabal-pkg.url = "github:example/some-cabal-pkg"; + # some-cabal-pkg.flake = false; +}; + +outputs = { self, nixpkgs, haskell-flake-utils, ... 
}@inputs: + haskell-flake-utils.lib.simpleCabal2flake { + inherit self nixpkgs; + # systems = [ "x86_64-linux" ]; + + # DON'T FORGET TO PUT YOUR PACKAGE NAME HERE, REMOVING `throw` + name = "fuzzy-parse"; + + shellExtBuildInputs = {pkgs}: with pkgs; [ + haskellPackages.haskell-language-server + ]; + + # Wether to build hoogle in the default shell + shellWithHoogle = true; + + + ## Optional parameters follow + + # nixpkgs config + # config = { }; + + # Add another haskell flakes as requirements + # haskellFlakes = [ inputs.another-simple-haskell-flake ]; + + # Use this to load other flakes overlays to supplement nixpkgs + # preOverlays = [ ]; + + # Pass either a function or a file + # preOverlay = ./overlay.nix; + + # Override haskell packages + # hpPreOverrides = { pkgs }: new: old: + # with pkgs.haskell.lib; with haskell-flake-utils.lib; + # tunePackages pkgs old { + # some-haskellPackages-package = [ dontHaddock ]; + # } // { + # some-cabal-pkg = ((jailbreakUnbreak pkgs) (dontCheck (old.callCabal2nix "some-cabal-pkg" inputs.some-cabal-pkg {}))); + # }; + + # Arguments for callCabal2nix + # cabal2nixArgs = {pkgs}: { + # }; + + # Maps to the devShell output. Pass in a shell.nix file or function + # shell = ./shell.nix + + # Additional build intputs of the default shell + # shellExtBuildInputs = {pkgs}: with pkgs; [ + # haskellPackages.haskell-language-server + # ]; + + # Wether to build hoogle in the default shell + # shellWithHoogle = true; + + }; +} diff --git a/fuzzy-parse.cabal b/fuzzy-parse.cabal new file mode 100644 index 00000000..a7541580 --- /dev/null +++ b/fuzzy-parse.cabal @@ -0,0 +1,160 @@ +cabal-version: 3.0 + +name: fuzzy-parse +version: 0.1.3.1 +synopsis: Tools for processing unstructured text data + +description: + The lightweight and easy to use functions for text tokenizing and parsing. It aimed for + parsing mostly unstructured data, but the structured formats may be parsed as well. + + It may be used in different sutiations, for DSL, tex markups or even for parsing simple + grammars easier and sometimes faster than in case of usage mainstream parsing combinators + or parser generators. + + See the README.markdown, examples and modules documentation for more. 
+ +license: MIT +license-file: LICENSE +author: Dmitry Zuikov +maintainer: dzuikov@gmail.com + +category: Text, Parsing +extra-source-files: CHANGELOG.md + +homepage: https://github.com/hexresearch/fuzzy-parse +bug-reports: https://github.com/hexresearch/fuzzy-parse/issues + +extra-source-files: + README.markdown + +common shared-properties + + default-language: GHC2021 + + default-extensions: + ApplicativeDo + , BangPatterns + , BlockArguments + , ConstraintKinds + , DataKinds + , DeriveDataTypeable + , DeriveGeneric + , DerivingStrategies + , DerivingVia + , ExtendedDefaultRules + , FlexibleContexts + , FlexibleInstances + , GADTs + , GeneralizedNewtypeDeriving + , ImportQualifiedPost + , LambdaCase + , MultiParamTypeClasses + , OverloadedStrings + , QuasiQuotes + , RecordWildCards + , ScopedTypeVariables + , StandaloneDeriving + , TemplateHaskell + , TupleSections + , TypeApplications + , TypeFamilies + + + +library + + import: shared-properties + + ghc-options: + -Wall + -fno-warn-type-defaults + -O2 + "-with-rtsopts=-N4 -A64m -AL256m -I0" + + + exposed-modules: Data.Text.Fuzzy.Tokenize + , Data.Text.Fuzzy.Dates + , Data.Text.Fuzzy.Section + , Data.Text.Fuzzy.SExp + , Data.Text.Fuzzy.Attoparsec.Day + , Data.Text.Fuzzy.Attoparsec.Month + + build-depends: base + , attoparsec + , containers + , mtl + , prettyprinter + , safe + , streaming + , scientific + , text + , time + , microlens-platform + , uniplate + , unliftio + , unordered-containers + , timeit + + hs-source-dirs: src + + +executable fuzzy-sexp-parse + + import: shared-properties + default-language: GHC2021 + + ghc-options: + -Wall + -fno-warn-type-defaults + -O2 + + + main-is: FuzzySexpParse.hs + + hs-source-dirs: misc + + build-depends: base, fuzzy-parse + , containers + , hspec + , hspec-discover + , interpolatedstring-perl6 + , text + , mtl + , streaming + , transformers + , exceptions + , uniplate + , microlens-platform + , safe + , timeit + , prettyprinter + + +test-suite fuzzy-parse-test + import: shared-properties + default-language: GHC2021 + + + type: exitcode-stdio-1.0 + main-is: Spec.hs + other-modules: FuzzyParseSpec + hs-source-dirs: test + build-depends: base, fuzzy-parse + , containers + , hspec + , hspec-discover + , interpolatedstring-perl6 + , text + , mtl + , streaming + , transformers + , exceptions + , uniplate + , microlens-platform + , safe + , timeit + + build-tool-depends: hspec-discover:hspec-discover == 2.* + + diff --git a/misc/FuzzySexpParse.hs b/misc/FuzzySexpParse.hs new file mode 100644 index 00000000..e9a037fb --- /dev/null +++ b/misc/FuzzySexpParse.hs @@ -0,0 +1,30 @@ +module Main where + +import Data.Text.Fuzzy.SExp +import Data.Text.IO qualified as IO +import Data.Text qualified as Text +import Data.Either +import System.TimeIt +import Control.Monad.Except +import Data.Functor +import Data.Function +import Data.Fixed +import Prettyprinter +import System.IO + +main :: IO () +main = do + s <- IO.getContents + + (tt,toks) <- timeItT do + pure (tokenizeSexp s) + + (pt,top) <- timeItT do + runExceptT (parseTop @() s) <&> either (error.show) id + + print (vcat (fmap pretty top)) + + hPrint stderr $ pretty (Text.length s) <+> "chars, parsed in" <+> viaShow (realToFrac pt :: Fixed E6) + + + diff --git a/nix/derivations/.gitignore b/nix/derivations/.gitignore new file mode 100644 index 00000000..e69de29b diff --git a/nix/pkgs.json b/nix/pkgs.json new file mode 100644 index 00000000..5f553b6d --- /dev/null +++ b/nix/pkgs.json @@ -0,0 +1,7 @@ +{ + "url": "https://github.com/NixOS/nixpkgs.git", + "rev": 
"5cf0de2485efeccc307692eedadbb2d9bfdc7013", + "date": "2020-06-04T17:30:37+08:00", + "sha256": "07axrr50nlmnvba5ja2ihzjwczi66znak57bhcz472w22w7m3sd1", + "fetchSubmodules": false +} diff --git a/nix/pkgs.nix b/nix/pkgs.nix new file mode 100644 index 00000000..2ab3662c --- /dev/null +++ b/nix/pkgs.nix @@ -0,0 +1,5 @@ +import ((import {}).fetchFromGitHub { + owner = "NixOS"; + repo = "nixpkgs"; + inherit (builtins.fromJSON (builtins.readFile ./pkgs.json)) rev sha256; +}) diff --git a/nix/release.nix b/nix/release.nix new file mode 100644 index 00000000..f093ba4d --- /dev/null +++ b/nix/release.nix @@ -0,0 +1,29 @@ +let + pkgs = import ./pkgs.nix { inherit config; + }; + lib = pkgs.haskell.lib; + config = { + packageOverrides = pkgs: rec { + haskellPackages = pkgs.haskellPackages.override { overrides = haskOverrides; }; + }; + }; + gitignore = pkgs.callPackage (pkgs.fetchFromGitHub { + owner = "siers"; + repo = "nix-gitignore"; + rev = "ce0778ddd8b1f5f92d26480c21706b51b1af9166"; + sha256 = "1d7ab78i2k13lffskb23x8b5h24x7wkdmpvmria1v3wb9pcpkg2w"; + }) {}; + ignore = gitignore.gitignoreSourceAux '' + .stack-work + dist + dist-newstyle + .ghc.environment* + ''; + haskOverrides = new: old: + let overrides = lib.packagesFromDirectory { directory = ./derivations; } new old; + in overrides; +in rec { + inherit pkgs; + packages = { inherit (pkgs.haskellPackages) fuzzy-parse; + }; +} diff --git a/nix/update-nixpkgs.sh b/nix/update-nixpkgs.sh new file mode 100755 index 00000000..773d0b54 --- /dev/null +++ b/nix/update-nixpkgs.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env nix-shell +#! nix-shell -i bash -p nix-prefetch-git +nix-prefetch-git https://github.com/NixOS/nixpkgs.git | tee pkgs.json diff --git a/scripts/upload-docs.sh b/scripts/upload-docs.sh new file mode 100755 index 00000000..492fa288 --- /dev/null +++ b/scripts/upload-docs.sh @@ -0,0 +1,10 @@ +#!/bin/sh +set -e + +dir=$(mktemp -d dist-docs.XXXXXX) +trap 'rm -r "$dir"' EXIT + +# assumes cabal 2.4 or later +cabal v2-haddock --builddir="$dir" --haddock-for-hackage --enable-doc + +cabal upload -d --publish $dir/*-docs.tar.gz diff --git a/src/Data/Text/Fuzzy/Attoparsec/Day.hs b/src/Data/Text/Fuzzy/Attoparsec/Day.hs new file mode 100644 index 00000000..9058737a --- /dev/null +++ b/src/Data/Text/Fuzzy/Attoparsec/Day.hs @@ -0,0 +1,97 @@ +module Data.Text.Fuzzy.Attoparsec.Day ( dayDMY + , dayYMD + , dayYYYYMMDD + , dayDMonY + , day + ) where + +import Data.List (zipWith) +import Control.Applicative ((<|>)) +import Data.Attoparsec.Text (Parser,decimal,digit,count,satisfy,inClass,skipWhile) +import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength) +import qualified Data.Char as Char +import qualified Data.Text as Text + + + +day :: Parser Day +day = dayDMonY <|> dayYYYYMMDD <|> dayYMD <|> dayDMY + +skipDelim :: Parser () +skipDelim = skipWhile (inClass " ./-") + +dayDMY :: Parser Day +dayDMY = do + d <- decimal :: Parser Int + skipDelim + m <- decimal :: Parser Int + skipDelim + y' <- decimal :: Parser Integer + maybe (fail "bad date format") pure (makeDay y' m d) + +dayYMD :: Parser Day +dayYMD = do + y' <- decimal :: Parser Integer + skipDelim + m <- decimal :: Parser Int + skipDelim + d <- decimal :: Parser Int + maybe (fail "bad date format") pure (makeDay y' m d) + +dayYYYYMMDD :: Parser Day +dayYYYYMMDD = do + y <- fromIntegral . num n4 . map o <$> count 4 digit + m <- num n2 . map o <$> count 2 digit + d <- num n2 . 
map o <$> count 2 digit + maybe (fail "bad date format") pure (makeDay y m d) + where n4 = [1000,100,10,1] + n2 = [10,1] + o x = Char.ord x - Char.ord '0' + num n x = sum $ zipWith (*) x n + +dayDMonY :: Parser Day +dayDMonY = do + d <- decimal :: Parser Int + skipDelim + m <- pMon + skipDelim + y <- decimal :: Parser Integer + maybe (fail "bad date format") pure (makeDay y m d) + where + pMon :: Parser Int + pMon = do + txt <- Text.toUpper . Text.pack <$> count 3 (satisfy Char.isLetter) + case txt of + "JAN" -> pure 1 + "FEB" -> pure 2 + "MAR" -> pure 3 + "APR" -> pure 4 + "MAY" -> pure 5 + "JUN" -> pure 6 + "JUL" -> pure 7 + "AUG" -> pure 8 + "SEP" -> pure 9 + "OCT" -> pure 10 + "NOV" -> pure 11 + "DEC" -> pure 12 + _ -> fail "bad month name" + + +makeYear :: Integer -> Maybe Integer +makeYear y' = if y < 1900 && y' < 99 + then Nothing + else pure y + where + y = if y' < 50 + then y' + 2000 + else (if y' >= 50 && y' <= 99 + then y' + 1900 + else y' ) + +makeDay :: Integer -> Int -> Int -> Maybe Day +makeDay y m d | m <= 12 && m > 0 = + makeYear y >>= \yyyy -> if d <= gregorianMonthLength yyyy m + then pure $ fromGregorian yyyy m d + else Nothing + + | otherwise = Nothing diff --git a/src/Data/Text/Fuzzy/Attoparsec/Month.hs b/src/Data/Text/Fuzzy/Attoparsec/Month.hs new file mode 100644 index 00000000..42596a13 --- /dev/null +++ b/src/Data/Text/Fuzzy/Attoparsec/Month.hs @@ -0,0 +1,46 @@ +module Data.Text.Fuzzy.Attoparsec.Month ( fuzzyMonth, fuzzyMonthFromText + ) where + +import Control.Applicative ((<|>)) +import Data.Attoparsec.Text (Parser,decimal,digit,letter,many1,parseOnly) +import Data.Map (Map) +import Data.Maybe +import Data.Text (Text) +import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength) +import qualified Data.Char as Char +import qualified Data.Map as Map +import qualified Data.Text as Text + + +fuzzyMonth :: Parser Int +fuzzyMonth = pMonthNum <|> pMonth + +fuzzyMonthFromText :: Text -> Maybe Int +fuzzyMonthFromText = either (const Nothing) Just . parseOnly fuzzyMonth + +pMonthNum :: Parser Int +pMonthNum = do + n <- decimal + if n >= 1 && n <= 13 + then pure n + else fail "invalid months num" + +pMonth :: Parser Int +pMonth = do + mo <- many1 (Char.toLower <$> letter) + maybe (fail "invalid month name") pure (Map.lookup mo months) + where + months :: Map String Int + months = Map.fromList [ ("jan", 1), ("january" , 1) + , ("feb", 2), ("febuary" , 2) + , ("mar", 3), ("march" , 3) + , ("apr", 4), ("april" , 4) + , ("may", 5), ("may" , 5) + , ("jun", 6), ("june" , 6) + , ("jul", 7), ("july" , 7) + , ("aug", 8), ("august" , 8) + , ("sep", 9), ("september", 9) + , ("oct", 10), ("october" , 10) + , ("nov", 11), ("november" , 11) + , ("dec", 12), ("december" , 12) + ] diff --git a/src/Data/Text/Fuzzy/Dates.hs b/src/Data/Text/Fuzzy/Dates.hs new file mode 100644 index 00000000..9aa62c80 --- /dev/null +++ b/src/Data/Text/Fuzzy/Dates.hs @@ -0,0 +1,46 @@ +-- | +-- Module : Data.Text.Fuzzy.Dates +-- Copyright : Dmitry Zuikov 2020 +-- License : MIT +-- +-- Maintainer : dzuikov@gmail.com +-- Stability : experimental +-- Portability : unknown +-- +-- Dates fuzzy parsing. +-- Supports a number of dates format and tries to recover +-- the incomplete dates from text, with use of some +-- reasonable assumptions. Does not support locales, +-- i.e assums only English for dates yet. 
+-- +-- == Examples +-- +-- > parseMaybeDay "01.01.1979" +-- > Just 1979-01-01 +-- > parseMaybeDay "01.01.01" +-- > Just 2001-01-01 +-- > parseMaybeDay "13/01/2019" +-- > Just 2019-01-13 +-- > parseMaybeDay "2019-12-1" +-- > Just 2019-12-01 +-- > parseMaybeDay "21-feb-79" +-- > Just 1979-02-21 +-- > parseMaybeDay "21-feb-01" +-- > Just 2001-02-21 +-- > parseMaybeDay "29feb04" +-- > Just 2004-02-29 +-- > parseMaybeDay "21feb28" +-- > Just 2028-02-21 + +module Data.Text.Fuzzy.Dates where + +import Data.Attoparsec.Text (parseOnly) +import Data.Either (either) +import Data.Text.Fuzzy.Attoparsec.Day +import Data.Text (Text) +import Data.Time.Calendar + +-- | Tries to parse a date from the text. +parseMaybeDay :: Text -> Maybe Day +parseMaybeDay s = either (const Nothing) pure (parseOnly day s) + diff --git a/src/Data/Text/Fuzzy/SExp.hs b/src/Data/Text/Fuzzy/SExp.hs new file mode 100644 index 00000000..c6e21dba --- /dev/null +++ b/src/Data/Text/Fuzzy/SExp.hs @@ -0,0 +1,378 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE ExtendedDefaultRules #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE ImportQualifiedPost #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE TemplateHaskell #-} +module Data.Text.Fuzzy.SExp where + +import Data.Text (Text) + +import Control.Applicative +import Control.Monad +import Data.Function +import Data.Functor +import Data.Text.Fuzzy.Tokenize +import Control.Monad.Reader +import Data.Typeable +import Control.Monad.Except +import Control.Monad.RWS +import Data.Maybe +import Data.Char (isSpace,digitToInt) +import Data.Generics.Uniplate.Data() +import Safe +import Data.Data +import GHC.Generics +import Lens.Micro.Platform +import Data.Text qualified as Text +import Data.Coerce +import Data.Scientific + +import Data.HashMap.Strict (HashMap) +import Data.HashMap.Strict qualified as HM + +import Prettyprinter hiding (braces,list) + + +import Streaming.Prelude qualified as S + +data TTok = TChar Char + | TSChar Char + | TPunct Char + | TText Text + | TStrLit Text + | TKeyword Text + | TEmpty + | TIndent Int + deriving stock (Eq,Ord,Show,Data,Generic) + +instance IsToken TTok where + mkChar = TChar + mkSChar = TSChar + mkPunct = TPunct + mkText = TText + mkStrLit = TStrLit + mkKeyword = TKeyword + mkEmpty = TEmpty + mkIndent = TIndent + +newtype C0 = C0 (Maybe Int) + deriving stock (Eq,Ord,Show,Data,Typeable,Generic) + +data SExpParseError = + ParensOver C0 + | ParensUnder C0 + | ParensUnmatched C0 + | SyntaxError C0 + deriving stock (Show,Typeable) + + +data NumType = + NumInteger Integer + | NumDouble Scientific + deriving stock (Eq,Ord,Show,Data,Generic) + +class Monoid c => ForMicroSexp c where + +instance Monoid C0 where + mempty = C0 Nothing + +instance Semigroup C0 where + (<>) (C0 a) (C0 b) = C0 (b <|> a) + +instance ForMicroSexp C0 where + + +instance ForMicroSexp () where + +data MicroSexp c = + List_ c [MicroSexp c] + | Symbol_ c Text + | String_ c Text + | Number_ c NumType + | Boolean_ c Bool + deriving stock (Show,Data,Generic) + +pattern List :: ForMicroSexp c => [MicroSexp c] -> MicroSexp c +pattern List xs <- List_ _ xs where + List xs = List_ mempty xs + +pattern Symbol :: ForMicroSexp c => Text -> MicroSexp c +pattern Symbol xs <- Symbol_ _ xs where + Symbol xs = Symbol_ mempty xs + +pattern String :: ForMicroSexp c => Text -> MicroSexp c +pattern String x <- String_ _ x where + String x = String_ mempty x + +pattern Number :: ForMicroSexp c => NumType -> 
MicroSexp c +pattern Number n <- Number_ _ n where + Number n = Number_ mempty n + +pattern Boolean :: ForMicroSexp c => Bool -> MicroSexp c +pattern Boolean b <- Boolean_ _ b where + Boolean b = Boolean_ mempty b + +{-# COMPLETE List, Symbol, String, Number, Boolean #-} + + +contextOf :: Lens (MicroSexp c) (MicroSexp c) c c +contextOf = lens g s + where + s sexp c = case sexp of + List_ _ a -> List_ c a + Symbol_ _ a -> Symbol_ c a + String_ _ a -> String_ c a + Number_ _ a -> Number_ c a + Boolean_ _ a -> Boolean_ c a + + g = \case + List_ c _ -> c + Symbol_ c _ -> c + String_ c _ -> c + Number_ c _ -> c + Boolean_ c _ -> c + +nil :: forall c . ForMicroSexp c => MicroSexp c +nil = List [] + +symbol :: forall c . ForMicroSexp c => Text -> MicroSexp c +symbol = Symbol + +str :: forall c . ForMicroSexp c => Text -> MicroSexp c +str = String + +newtype SExpEnv = + SExpEnv + { sexpTranslate :: Bool + } + +data SExpState = + SExpState + { _sexpLno :: Int + , _sexpBraces :: [Char] + } + +makeLenses 'SExpState + +defEnv :: SExpEnv +defEnv = SExpEnv True + +newtype SExpM m a = SExpM { fromSexpM :: RWST SExpEnv () SExpState m a } + deriving newtype + ( Applicative + , Functor + , Monad + , MonadState SExpState + , MonadReader SExpEnv + , MonadTrans + ) + + +instance MonadError SExpParseError m => MonadError SExpParseError (SExpM m) where + throwError = lift . throwError + catchError w = catchError (coerce $ fromSexpM w) + +tokenizeSexp :: Text -> [TTok] +tokenizeSexp txt = do + let spec = delims " \r\t" <> comment ";" + <> punct "'{}()[]\n" + <> sqq + <> uw + tokenize spec txt + +runSexpM :: Monad m => SExpM m a -> m a +runSexpM f = evalRWST (fromSexpM f) defEnv (SExpState 0 []) <&> fst + + +parseSexp :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m (MicroSexp c) +parseSexp txt = do + (s, _) <- runSexpM do + (s,rest) <- sexp (tokenizeSexp txt) + checkBraces + pure (s,rest) + + pure s + +checkBraces :: (MonadError SExpParseError m) => SExpM m () +checkBraces = do + braces <- gets (view sexpBraces) + unless (null braces) $ raiseWith ParensUnder + +succLno :: (MonadError SExpParseError m) => SExpM m () +succLno = modify (over sexpLno succ) + +parseTop :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m [MicroSexp c] +parseTop txt = do + let tokens = tokenizeSexp txt + S.toList_ $ runSexpM do + flip fix (mempty,tokens) $ \next -> \case + (acc, []) -> do + emit acc + (acc, TPunct '\n' : rest) -> do + succLno + emit acc + next (mempty,rest) + (acc, rest) -> do + (s, xs) <- sexp rest + next (acc <> [s],xs) + + where + + emit [] = pure () + emit wtf = case wtf of + [List one] -> lift $ S.yield (List one) + xs -> lift $ S.yield (List xs) + +sexp :: (ForMicroSexp c, MonadError SExpParseError m) => [TTok] -> SExpM m (MicroSexp c, [TTok]) +sexp s = case s of + [] -> do + checkBraces + pure (nil, mempty) + + (TText l : w) -> (,w) <$> trNum (Symbol l) + + (TStrLit l : w) -> pure (String l, w) + + -- so far ignored + (TPunct '\'' : rest) -> sexp rest + + (TPunct '\n' : rest) -> succLno >> sexp rest + + (TPunct c : rest) | isSpace c -> sexp rest + + (TPunct c : rest) | isBrace c -> + maybe (pure (nil, rest)) (`list` rest) (closing c) + | otherwise -> do + raiseWith ParensOver + + ( _ : _ ) -> raiseWith SyntaxError + + where + + setContext w = do + co <- getC0 + pure $ over _2 (set contextOf co) w + + isBrace :: Char -> Bool + isBrace c = HM.member c braces + + closing :: Char -> Maybe Char + closing c = HM.lookup c braces + + braces :: HashMap Char Char + braces = HM.fromList[ ('{', '}') + 
, ('(', ')') + , ('[', ']') + , ('<', '>') + ] + + cBraces :: [Char] + cBraces = HM.elems braces + + trNum tok = do + + trans <- asks sexpTranslate + + case tok of + Symbol s | trans -> do + let s0 = Text.unpack s + + let what = Number . NumInteger <$> readMay @Integer s0 + <|> + Number . NumInteger <$> parseBinary s0 + <|> + Number . NumDouble <$> readMay @Scientific s0 + <|> + ( case s of + "#t" -> Just (Boolean True) + "#f" -> Just (Boolean False) + _ -> Nothing + ) + + pure $ fromMaybe (Symbol s) what + + + x -> pure x + {-# INLINE trNum #-} + + list :: (ForMicroSexp c, MonadError SExpParseError m) + => Char + -> [TTok] + -> SExpM m (MicroSexp c, [TTok]) + + list _ [] = raiseWith ParensUnder + + list cb tokens = do + modify $ over sexpBraces (cb:) + + go cb mempty tokens + + where + + isClosingFor :: Char -> Bool + isClosingFor c = c `elem` cBraces + + go _ _ [] = do + checkBraces + pure (List mempty, mempty) + + go cl acc (TPunct c : rest) | isSpace c = do + go cl acc rest + + go cl acc (TPunct c : rest) + | isClosingFor c && c == cl = do + modify $ over sexpBraces (drop 1) + pure (List (reverse acc), rest) + + | isClosingFor c && c /= cl = do + raiseWith ParensUnmatched + -- throwError =<< ParensUnmatched <$> undefined + + go cl acc rest = do + (e,r) <- sexp rest + go cl (e : acc) r + + +getC0 :: Monad m => SExpM m C0 +getC0 = do + lno <- gets (view sexpLno) + pure (C0 (Just lno)) + +raiseWith :: (MonadError SExpParseError m) + => (C0 -> SExpParseError) -> SExpM m b + +raiseWith a = throwError =<< a <$> getC0 + +instance Pretty NumType where + pretty = \case + NumInteger n -> pretty n + NumDouble n -> viaShow n + +instance ForMicroSexp c => Pretty (MicroSexp c) where + + pretty = \case + List xs -> parens (hsep (fmap pretty xs)) + String s -> dquotes (pretty s) + Symbol s -> pretty s + Number n -> pretty n + Boolean True -> pretty "#t" + Boolean False -> pretty "#f" + +isBinaryDigit :: Char -> Bool +isBinaryDigit c = c == '0' || c == '1' + +parseBinary :: String -> Maybe Integer +parseBinary str = + let + withoutPrefix = case str of + '0':'b':rest -> Just rest + '0':'B':rest -> Just rest + _ -> Nothing + in if isJust withoutPrefix && all isBinaryDigit (fromJust withoutPrefix) + then Just $ foldl (\acc x -> acc * 2 + toInteger (digitToInt x)) 0 (fromJust withoutPrefix) + else Nothing + diff --git a/src/Data/Text/Fuzzy/Section.hs b/src/Data/Text/Fuzzy/Section.hs new file mode 100644 index 00000000..c7e8166d --- /dev/null +++ b/src/Data/Text/Fuzzy/Section.hs @@ -0,0 +1,15 @@ +module Data.Text.Fuzzy.Section (cutSectionBy, cutSectionOn) where + +import Data.Text (Text) +import qualified Data.List as List + + +cutSectionOn :: Text -> Text -> [Text] -> [Text] +cutSectionOn a b txt = cutSectionBy ((==)a) ((==b)) txt + +cutSectionBy :: (Text -> Bool) -> (Text -> Bool) -> [Text] -> [Text] +cutSectionBy a b txt = cutI + where + cutC = List.dropWhile (not . a) txt + cutI = List.takeWhile (not . b) cutC + diff --git a/src/Data/Text/Fuzzy/Tokenize.hs b/src/Data/Text/Fuzzy/Tokenize.hs new file mode 100644 index 00000000..a8bfa6d1 --- /dev/null +++ b/src/Data/Text/Fuzzy/Tokenize.hs @@ -0,0 +1,556 @@ +-- | +-- Module : Data.Text.Fuzzy.Tokenize +-- Copyright : Dmitry Zuikov 2020 +-- License : MIT +-- +-- Maintainer : dzuikov@gmail.com +-- Stability : experimental +-- Portability : unknown +-- +-- The lightweight and multi-functional text tokenizer allowing different types of text tokenization +-- depending on it's settings. 
+-- +-- It may be used in different sutiations, for DSL, text markups or even for parsing simple grammars +-- easier and sometimes faster than in case of usage mainstream parsing combinators or parser +-- generators. +-- +-- The primary goal of this package is to parse unstructured text data, however it may be used for +-- parsing such data formats as CSV with ease. +-- +-- Currently it supports the following types of entities: atoms, string literals (currently with the +-- minimal set of escaped characters), punctuation characters and delimeters. +-- +-- == Examples +-- === Simple CSV-like tokenization +-- >>> tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text] +-- ["aaa "," bebeb "," qqq "] +-- +-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text] +-- ["aaa "," bebeb "," qqq ","","","",""] +-- +-- >>>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text] +-- [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] +-- +-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] +-- [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] +-- +-- >>> let spec = sl<>delims ":"<>sq<>emptyFields<>noslits +-- >>> tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] +-- [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing] +-- +-- >>> let spec = delims ":"<>sq<>emptyFields<>uw<>noslits +-- >>> tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] +-- [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing] +-- +-- == Notes +-- +-- === About the delimeter tokens +-- This type of tokens appears during a "delimited" +-- formats processing and disappears in results. Currenly +-- you will never see it unless normalization is turned off by 'nn' option. +-- +-- The delimeters make sense in case of processing the CSV-like formats, +-- but in this case you probably need only values in results. +-- +-- This behavior may be changed later. But right now delimeters seem pointless +-- in results. If you process some sort of grammar where delimeter character +-- is important, you may use punctuation instead, i.e: +-- +-- >>> let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq +-- >>> tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text] +-- ["(","delimeters",",","are",",","important",",","spaces are not",")",";"] +-- +-- == Other +-- For CSV-like formats it makes sense to split text to lines first, +-- otherwise newline characters may cause to weird results +-- +-- + +module Data.Text.Fuzzy.Tokenize ( TokenizeSpec + , IsToken(..) + , tokenize + , esc + , addEmptyFields + , emptyFields + , nn + , sq + , sqq + , noslits + , sl + , sr + , uw + , delims + , comment + , punct + , indent + , itabstops + , keywords + , eol + ) where + +import Prelude hiding (init) + +import Control.Applicative +import Data.Map (Map) +import Data.Maybe (fromMaybe) +import Data.Monoid() +import Data.Set (Set) +import Data.Text (Text) +import qualified Data.List as List +import qualified Data.Map as Map +import qualified Data.Set as Set +import qualified Data.Text as Text + +import Control.Monad (when) +import Control.Monad.RWS + +-- | Tokenization settings. Use mempty for an empty value +-- and construction functions for changing the settings. 
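+--
+-- Settings combine via '(<>)', for example (repeating one of the examples shown in the
+-- module header above):
+--
+-- >>> tokenize (delims ":" <> sq <> emptyFields) "aaa : bebeb : qqq ::::" :: [Maybe Text]
+-- [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]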
+-- +data TokenizeSpec = TokenizeSpec { tsAtoms :: Set Text + , tsStringQQ :: Maybe Bool + , tsStringQ :: Maybe Bool + , tsNoSlits :: Maybe Bool + , tsLineComment :: Map Char Text + , tsDelims :: Set Char + , tsEol :: Maybe Bool + , tsStripLeft :: Maybe Bool + , tsStripRight :: Maybe Bool + , tsUW :: Maybe Bool + , tsNotNormalize :: Maybe Bool + , tsEsc :: Maybe Bool + , tsAddEmptyFields :: Maybe Bool + , tsPunct :: Set Char + , tsIndent :: Maybe Bool + , tsItabStops :: Maybe Int + , tsKeywords :: Set Text + } + deriving (Eq,Ord,Show) + + +instance Semigroup TokenizeSpec where + (<>) a b = TokenizeSpec { tsAtoms = tsAtoms b <> tsAtoms a + , tsStringQQ = tsStringQQ b <|> tsStringQQ a + , tsStringQ = tsStringQ b <|> tsStringQ a + , tsNoSlits = tsNoSlits b <|> tsNoSlits a + , tsLineComment = tsLineComment b <> tsLineComment a + , tsDelims = tsDelims b <> tsDelims a + , tsEol = tsEol b <|> tsEol a + , tsStripLeft = tsStripLeft b <|> tsStripLeft a + , tsStripRight = tsStripRight b <|> tsStripRight a + , tsUW = tsUW b <|> tsUW a + , tsNotNormalize = tsNotNormalize b <|> tsNotNormalize a + , tsEsc = tsEsc b <|> tsEsc a + , tsAddEmptyFields = tsAddEmptyFields b <|> tsAddEmptyFields a + , tsPunct = tsPunct b <> tsPunct a + , tsIndent = tsIndent b <|> tsIndent a + , tsItabStops = tsItabStops b <|> tsItabStops a + , tsKeywords = tsKeywords b <> tsKeywords a + } + +instance Monoid TokenizeSpec where + mempty = TokenizeSpec { tsAtoms = mempty + , tsStringQQ = Nothing + , tsStringQ = Nothing + , tsNoSlits = Nothing + , tsLineComment = mempty + , tsDelims = mempty + , tsEol = Nothing + , tsStripLeft = Nothing + , tsStripRight = Nothing + , tsUW = Nothing + , tsNotNormalize = Nothing + , tsEsc = Nothing + , tsAddEmptyFields = Nothing + , tsPunct = mempty + , tsIndent = Nothing + , tsItabStops = Nothing + , tsKeywords = mempty + } + + +justTrue :: Maybe Bool -> Bool +justTrue (Just True) = True +justTrue _ = False + +-- | Turns on EOL token generation +eol :: TokenizeSpec +eol = mempty { tsEol = pure True } + +-- | Turn on character escaping inside string literals. +-- Currently the following escaped characters are +-- supported: [" ' \ t n r a b f v ] +esc :: TokenizeSpec +esc = mempty { tsEsc = pure True } + +-- | Raise empty field tokens (note mkEmpty method) +-- when no tokens found before a delimeter. +-- Useful for processing CSV-like data in +-- order to distingush empty columns +addEmptyFields :: TokenizeSpec +addEmptyFields = mempty { tsAddEmptyFields = pure True } + +-- | same as addEmptyFields +emptyFields :: TokenizeSpec +emptyFields = addEmptyFields + +-- | Turns off token normalization. Makes the tokenizer +-- generate character stream. Useful for debugging. +nn :: TokenizeSpec +nn = mempty { tsNotNormalize = pure True } + +-- | Turns on single-quoted string literals. +-- Character stream after '\'' character +-- will be proceesed as single-quoted stream, +-- assuming all delimeter, comment and other special +-- characters as a part of the string literal until +-- the next unescaped single quote character. +sq :: TokenizeSpec +sq = mempty { tsStringQ = pure True } + +-- | Enable double-quoted string literals support +-- as 'sq' for single-quoted strings. +sqq :: TokenizeSpec +sqq = mempty { tsStringQQ = pure True } + +-- | Disable separate string literals. +-- +-- Useful when processed delimeted data (csv-like formats). 
+-- Normally, sequential text chunks are concatenated together, +-- but consequent text and string literal will produce the two +-- different tokens and it may cause weird results if data +-- is in csv-like format, i.e: +-- +-- >>> tokenize (delims ":"<>emptyFields<>sq ) "aaa:bebe:'qq' aaa:next::" :: [Maybe Text] +-- [Just "aaa",Just "bebe",Just "qq",Just " aaa",Just "next",Nothing,Nothing] +-- +-- look: "qq" and " aaa" are turned into two separate tokens that makes the result +-- of CSV processing looks improper, like it has an extra-column. This behavior may be +-- avoided using this option, if you don't need to distinguish text chunks and string +-- literals: +-- +-- >>> tokenize (delims ":"<>emptyFields<>sq<>noslits) "aaa:bebe:'qq:foo' aaa:next::" :: [Maybe Text] +-- [Just "aaa",Just "bebe",Just "qq:foo aaa",Just "next",Nothing,Nothing] +-- +noslits :: TokenizeSpec +noslits = mempty { tsNoSlits = pure True } + +-- | Specify the list of delimers (characters) +-- to split the character stream into fields. Useful for CSV-like separated formats. Support for +-- empty fields in token stream may be enabled by 'addEmptyFields' function +delims :: String -> TokenizeSpec +delims s = mempty { tsDelims = Set.fromList s } + +-- | Strip spaces on left side of a token. +-- Does not affect string literals, i.e string are processed normally. Useful mostly for +-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces. +sl :: TokenizeSpec +sl = mempty { tsStripLeft = pure True } + +-- | Strip spaces on right side of a token. +-- Does not affect string literals, i.e string are processed normally. Useful mostly for +-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces. +sr :: TokenizeSpec +sr = mempty { tsStripRight = pure True } + +-- | Strips spaces on right and left sides and transforms multiple spaces into the one. +-- Name origins from unwords . words +-- +-- Does not affect string literals, i.e string are processed normally. Useful mostly for +-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces. +uw :: TokenizeSpec +uw = mempty { tsUW = pure True } + +-- | Specify the line comment prefix. +-- All text after the line comment prefix will +-- be ignored until the newline character appearance. +-- Multiple line comments are supported. +comment :: Text -> TokenizeSpec +comment s = mempty { tsLineComment = cmt } + where + cmt = case Text.uncons s of + Just (p,su) -> Map.singleton p su + Nothing -> mempty + +-- | Specify the punctuation characters. +-- Any punctuation character is handled as a separate +-- token. +-- Any token will be breaked on a punctiation character. +-- +-- Useful for handling ... er... punctuaton, like +-- +-- >> function(a,b) +-- +-- or +-- +-- >> (apply function 1 2 3) +-- +-- +-- >>> tokenize spec "(apply function 1 2 3)" :: [Text] +-- ["(","apply","function","1","2","3",")"] +-- +punct :: Text -> TokenizeSpec +punct s = mempty { tsPunct = Set.fromList (Text.unpack s) } + +-- | Specify the keywords list. +-- Each keyword will be threated as a separate token. +keywords :: [Text] -> TokenizeSpec +keywords s = mempty { tsKeywords = Set.fromList s } + +-- | Enable identation support +indent :: TokenizeSpec +indent = mempty { tsIndent = Just True } + +-- | Set tab expanding multiplier +-- i.e. each tab extends into n spaces before processing. +-- It also turns on the indentation. Only the tabs at the beginning of the string are expanded, +-- i.e. 
before the first non-space character appears. +itabstops :: Int -> TokenizeSpec +itabstops n = mempty { tsIndent = Just True, tsItabStops = pure n } + +newtype TokenizeM w a = TokenizeM (RWS TokenizeSpec w () a) + deriving( Applicative + , Functor + , MonadReader TokenizeSpec + , MonadWriter w + , MonadState () + , Monad + ) + +data Token = TChar Char + | TSChar Char + | TPunct Char + | TText Text + | TSLit Text + | TKeyword Text + | TEmpty + | TDelim + | TIndent Int + | TEol + deriving (Eq,Ord,Show) + +-- | Typeclass for token values. +-- Note, that some tokens appear in results +-- only when 'nn' option is set, i.e. sequences +-- of characters turn out to text tokens or string literals +-- and delimeter tokens are just removed from the +-- results +class IsToken a where + -- | Create a character token + mkChar :: Char -> a + -- | Create a string literal character token + mkSChar :: Char -> a + -- | Create a punctuation token + mkPunct :: Char -> a + -- | Create a text chunk token + mkText :: Text -> a + -- | Create a string literal token + mkStrLit :: Text -> a + -- | Create a keyword token + mkKeyword :: Text -> a + -- | Create an empty field token + mkEmpty :: a + -- | Create a delimeter token + mkDelim :: a + mkDelim = mkEmpty + + -- | Creates an indent token + mkIndent :: Int -> a + mkIndent = const mkEmpty + + -- | Creates an EOL token + mkEol :: a + mkEol = mkEmpty + +instance IsToken (Maybe Text) where + mkChar = pure . Text.singleton + mkSChar = pure . Text.singleton + mkPunct = pure . Text.singleton + mkText = pure + mkStrLit = pure + mkKeyword = pure + mkEmpty = Nothing + +instance IsToken Text where + mkChar = Text.singleton + mkSChar = Text.singleton + mkPunct = Text.singleton + mkText = id + mkStrLit = id + mkKeyword = id + mkEmpty = "" + +-- | Tokenize a text +tokenize :: IsToken a => TokenizeSpec -> Text -> [a] +tokenize s t = map tr t1 + where + t1 = tokenize' s t + tr (TChar c) = mkChar c + tr (TSChar c) = mkSChar c + tr (TText c) = mkText c + tr (TSLit c) = mkStrLit c + tr (TKeyword c) = mkKeyword c + tr TEmpty = mkEmpty + tr (TPunct c) = mkPunct c + tr TDelim = mkDelim + tr (TIndent n) = mkIndent n + tr TEol = mkEol + +execTokenizeM :: TokenizeM [Token] a -> TokenizeSpec -> [Token] +execTokenizeM (TokenizeM m) spec = + let (_,w) = execRWS m spec () in norm w + + where norm x | justTrue (tsNotNormalize spec) = x + | otherwise = normalize spec x + +tokenize' :: TokenizeSpec -> Text -> [Token] +tokenize' spec txt = execTokenizeM (root' txt) spec + where + + r = spec + + noIndent = not doIndent + doIndent = justTrue (tsIndent r) + eolOk = justTrue (tsEol r) + + root' x = scanIndent x >>= root + + root ts = do + + case Text.uncons ts of + Nothing -> pure () + + Just ('\n', rest) | doIndent -> raiseEol >> root' rest + Just (c, rest) | Set.member c (tsDelims r) -> tell [TDelim] >> root rest + Just ('\'', rest) | justTrue (tsStringQ r) -> scanQ '\'' rest + Just ('"', rest) | justTrue (tsStringQQ r) -> scanQ '"' rest + + Just (c, rest) | Map.member c (tsLineComment r) -> scanComment (c,rest) + + Just (c, rest) | Set.member c (tsPunct r) -> tell [TPunct c] >> root rest + + Just (c, rest) | otherwise -> tell [TChar c] >> root rest + + + raiseEol | eolOk = tell [TEol] + | otherwise = pure () + + expandSpace ' ' = 1 + expandSpace '\t' = (fromMaybe 8 (tsItabStops r)) + expandSpace _ = 0 + + scanIndent x | noIndent = pure x + | otherwise = do + let (ss,as) = Text.span (\c -> c == ' ' || c == '\t') x + tell [ TIndent (sum (map expandSpace (Text.unpack ss))) ] + pure as + + scanComment 
(c,rest) = do + suff <- Map.lookup c <$> asks tsLineComment + case suff of + Just t | Text.isPrefixOf t rest -> do + root $ Text.dropWhile ('\n' /=) rest + + _ -> tell [TChar c] >> root rest + + scanQ q ts = do + + case Text.uncons ts of + Nothing -> root ts + + Just ('\\', rest) | justTrue (tsEsc r) -> unesc (scanQ q) rest + | otherwise -> tell [tsChar '\\'] >> scanQ q rest + + Just (c, rest) | c == q -> root rest + | otherwise -> tell [tsChar c] >> scanQ q rest + + unesc f ts = + case Text.uncons ts of + Nothing -> f ts + Just ('"', rs) -> tell [tsChar '"' ] >> f rs + Just ('\'', rs) -> tell [tsChar '\''] >> f rs + Just ('\\', rs) -> tell [tsChar '\\'] >> f rs + Just ('t', rs) -> tell [tsChar '\t'] >> f rs + Just ('n', rs) -> tell [tsChar '\n'] >> f rs + Just ('r', rs) -> tell [tsChar '\r'] >> f rs + Just ('a', rs) -> tell [tsChar '\a'] >> f rs + Just ('b', rs) -> tell [tsChar '\b'] >> f rs + Just ('f', rs) -> tell [tsChar '\f'] >> f rs + Just ('v', rs) -> tell [tsChar '\v'] >> f rs + Just (_, rs) -> f rs + + tsChar c | justTrue (tsNoSlits spec) = TChar c + | otherwise = TSChar c + +newtype NormStats = NormStats { nstatBeforeDelim :: Int } + +normalize :: TokenizeSpec -> [Token] -> [Token] +normalize spec tokens = snd $ execRWS (go tokens) () init + where + + go [] = addEmptyField + + go s@(TIndent _ : _) = do + let (iis, rest') = List.span isIndent s + tell [TIndent (sum [k | TIndent k <- iis])] + go rest' + + go (TChar c0 : cs) = do + let (n,ns) = List.span isTChar cs + succStat + let chunk = eatSpaces $ Text.pack (c0 : [ c | TChar c <- n]) + let kw = Set.member chunk (tsKeywords spec) + tell [ if kw then TKeyword chunk else TText chunk ] + go ns + + go (TSChar x : xs) = do + let (n,ns) = List.span isTSChar xs + succStat + tell [ TSLit $ Text.pack (x : [ c | TSChar c <- n]) ] + go ns + + go (TDelim : xs) = do + addEmptyField + pruneStat + go xs + + go (TPunct c : xs) = do + tell [ TPunct c ] + succStat + go xs + + go (x:xs) = tell [x] >> go xs + + succStat = do + modify (\x -> x { nstatBeforeDelim = succ (nstatBeforeDelim x)}) + + pruneStat = do + modify (\x -> x { nstatBeforeDelim = 0 } ) + + addEmptyField = do + ns <- gets nstatBeforeDelim + when (ns == 0 && justTrue (tsAddEmptyFields spec) ) $ do + tell [ TEmpty ] + + isTChar (TChar _) = True + isTChar _ = False + + isTSChar (TSChar _) = True + isTSChar _ = False + + isIndent (TIndent _) = True + isIndent _ = False + + init = NormStats { nstatBeforeDelim = 0 } + + eatSpaces s | sboth = Text.strip s + | sLonly = Text.stripStart s + | sRonly = Text.stripEnd s + | sWU = (Text.unwords . 
Text.words) s + | otherwise = s + + where sboth = justTrue (tsStripLeft spec) && justTrue (tsStripRight spec) + sLonly = justTrue (tsStripLeft spec) && not (justTrue (tsStripRight spec)) + sRonly = not (justTrue (tsStripLeft spec)) && justTrue (tsStripRight spec) + sWU = justTrue (tsUW spec) + diff --git a/src/Data/Text/LSH.hs b/src/Data/Text/LSH.hs new file mode 100644 index 00000000..10ccfc2d --- /dev/null +++ b/src/Data/Text/LSH.hs @@ -0,0 +1 @@ +module Data.Text.LSH where \ No newline at end of file diff --git a/test/FuzzyParseSpec.hs b/test/FuzzyParseSpec.hs new file mode 100644 index 00000000..bd868343 --- /dev/null +++ b/test/FuzzyParseSpec.hs @@ -0,0 +1,150 @@ +{-# LANGUAGE OverloadedStrings + , QuasiQuotes + , ExtendedDefaultRules + , LambdaCase + , ImportQualifiedPost + , DerivingStrategies + , PatternSynonyms + , ViewPatterns + , MultiWayIf + , TemplateHaskell +#-} + +module FuzzyParseSpec (spec) where + +import Data.Text (Text) +import Test.Hspec +import Text.InterpolatedString.Perl6 (q) + +import Data.Text.Fuzzy.Tokenize +import Data.Data +import Data.Generics.Uniplate.Data() +import GHC.Generics + +data TTok = TChar Char + | TSChar Char + | TPunct Char + | TText Text + | TStrLit Text + | TKeyword Text + | TEmpty + | TIndent Int + deriving stock (Eq,Ord,Show,Data,Generic) + +instance IsToken TTok where + mkChar = TChar + mkSChar = TSChar + mkPunct = TPunct + mkText = TText + mkStrLit = TStrLit + mkKeyword = TKeyword + mkEmpty = TEmpty + mkIndent = TIndent + + + +spec :: Spec +spec = do + describe "csv-like" $ do + it "splits text using ':' delimeter" $ do + let toks = tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text] + toks `shouldBe` ["aaa "," bebeb "," qqq "] + + it "splits text using ':' delimeter with single-quotes string and empty fields" $ do + let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text] + toks `shouldBe` ["aaa "," bebeb "," qqq ","","","",""] + + it "splits text using ':' delimeter with single-quotes string and empty fields" $ do + let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + toks `shouldBe` [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] + + it "splits text using ':' delimeter with single-quotes string and empty fields with noslits" $ do + let spec = sl<>delims ":"<>sq<>emptyFields<>noslits + let toks = tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + toks `shouldBe` [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing] + + it "splits text using ':' delimeter with single-quotes string and empty fields with noslits and uw" $ do + let spec = delims ":"<>sq<>emptyFields<>uw<>noslits + let toks = tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + toks `shouldBe` [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing] + + it "uses punctuation tokens" $ do + let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq + let toks = tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text] + toks `shouldBe` ["(","delimeters",",","are",",","important",",","spaces are not",")",";"] + + + it "tokenize simple lisp-like text with keywords" $ do + let spec = delims " \n\t" <> comment ";" + <> punct "{}()[]<>" + <> sq <> sqq + <> uw + <> keywords ["define","apply","+"] + + let code = [q| + (define add (a b ) ; define simple function + (+ a b) ) + (define r (add 10 20)) +|] + + let toks = tokenize 
spec code :: [TTok] + + let expected = [ TPunct '(' + , TKeyword "define" + , TText "add" , TPunct '(', TText "a" , TText "b", TPunct ')' + , TPunct '(', TKeyword "+", TText "a",TText "b",TPunct ')',TPunct ')' + , TPunct '(',TKeyword "define" + ,TText "r" + ,TPunct '(',TText "add",TText "10",TText "20" + ,TPunct ')',TPunct ')'] + + toks `shouldBe` expected + + + describe "Checks indentation support" $ do + + let spec = delims " \n\t" <> comment ";" + <> punct "{}()[]<>" + <> sq <> sqq + <> uw + <> indent + <> itabstops 8 + <> keywords ["define"] + + + + it "parses some indented blocks" $ do + + let expected = [ TIndent 0, TKeyword "define", TText "a", TText "0" + , TIndent 2, TText "atom", TText "foo", TText "2" + , TIndent 2, TKeyword "define", TText "aq", TText "2" + , TIndent 4, TText "atom", TText "one", TText "4" + , TIndent 4, TText "atom", TText "two", TText "4" + , TIndent 0, TKeyword "define", TText "b", TText "0" + , TIndent 2, TText "atom", TText "baar", TText "2" + , TIndent 2, TText "atom", TText "quux", TText "2" + , TIndent 2, TKeyword "define", TText "new", TText "2" + , TIndent 6, TText "atom", TText "bar", TText "6" + , TIndent 4, TText "atom", TText "fuu", TText "4" + , TIndent 0 + ] + + let pyLike = [q| +define a 0 + atom foo 2 + define aq 2 + atom one 4 + atom two 4 + +define b 0 + atom baar 2 + atom quux 2 + define new 2 + atom bar 6 + atom fuu 4 + +|] + let toks = tokenize spec pyLike :: [TTok] + toks `shouldBe` expected + + diff --git a/test/Spec.hs b/test/Spec.hs new file mode 100644 index 00000000..a824f8c3 --- /dev/null +++ b/test/Spec.hs @@ -0,0 +1 @@ +{-# OPTIONS_GHC -F -pgmF hspec-discover #-}