commit cf85d2df2af00bf8c8d59666aa16ee4a85a3ba20 Author: voidlizard Date: Mon Oct 7 05:06:03 2024 +0300 Squashed 'miscellaneous/fuzzy-parse/' content from commit a834b152e git-subtree-dir: miscellaneous/fuzzy-parse git-subtree-split: a834b152e29d632c816eefe117036e5d9330bd03 diff --git a/.envrc b/.envrc new file mode 100644 index 00000000..ce34212a --- /dev/null +++ b/.envrc @@ -0,0 +1,5 @@ +if [ -f .envrc.local ]; then + source_env .envrc.local +fi + +use flake diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..7a25a854 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +*.swp +dist-newstyle/ +Setup.hs + +.direnv +.hbs2-git/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..374e936c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# Revision history for fuzzy-parse + +## 0.1.2.0 + + - Techical release + - Added some missed things + +## 0.1.0.0 -- YYYY-mm-dd + +* First version. Released on an unsuspecting world. diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..232522e1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2019 Dmitry Zuikov + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.markdown b/README.markdown new file mode 100644 index 00000000..30eb7e38 --- /dev/null +++ b/README.markdown @@ -0,0 +1,131 @@ +# About + +# Data.Text.Fuzzy.Tokenize + +The lightweight and multi-functional text tokenizer allowing different types of text tokenization +depending on it's settings. + +It may be used in different sutiations, for DSL, text markups or even for parsing simple grammars +easier and sometimes faster than in case of usage mainstream parsing combinators or parser +generators. + +The primary goal of this package is to parse unstructured text data, however it may be used for +parsing such data formats as CSV with ease. + +Currently it supports the following types of entities: atoms, string literals (currently with the +minimal set of escaped characters), punctuation characters and delimeters. 
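+
+A minimal, self-contained usage sketch (the module and functions below are the ones this
+package exports; the printed result follows the tokenizer behavior shown in the examples
+that follow):
+
+```haskell
+{-# LANGUAGE OverloadedStrings #-}
+
+import Data.Text (Text)
+import Data.Text.Fuzzy.Tokenize (tokenize, delims)
+
+main :: IO ()
+main = print (tokenize (delims ":") "a:b:c" :: [Text])
+-- prints: ["a","b","c"]
+```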
+ +## Examples + +### Simple CSV-like tokenization + +```haskell +tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text] + +["aaa "," bebeb "," qqq "] +``` + +```haskell +tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text] + +["aaa "," bebeb "," qqq ","","","",""] +``` + +```haskell +tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text] + +[Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] +``` + +```haskell +tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + +[Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] +``` + +```haskell +let spec = sl<>delims ":"<>sq<>emptyFields<>noslits +tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + +[Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing] +``` + +```haskell +let spec = delims ":"<>sq<>emptyFields<>uw<>noslits +tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + +[Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing] +``` + +### Primitive lisp-like language + +```haskell +{-# LANGUAGE QuasiQuotes, ExtendedDefaultRules #-} + +import Text.InterpolatedString.Perl6 (q) +import Data.Text.Fuzzy.Tokenize + +data TTok = TChar Char + | TSChar Char + | TPunct Char + | TText Text + | TStrLit Text + | TKeyword Text + | TEmpty + deriving(Eq,Ord,Show) + +instance IsToken TTok where + mkChar = TChar + mkSChar = TSChar + mkPunct = TPunct + mkText = TText + mkStrLit = TStrLit + mkKeyword = TKeyword + mkEmpty = TEmpty + +main = do + + let spec = delims " \n\t" <> comment ";" + <> punct "{}()[]<>" + <> sq <> sqq + <> uw + <> keywords ["define","apply","+"] + let code = [q| + (define add (a b ) ; define simple function + (+ a b) ) + (define r (add 10 20)) +|] + let toks = tokenize spec code :: [TTok] + + print toks +``` + +## Notes + +### About the delimeter tokens +This type of tokens appears during a "delimited" formats processing and disappears in results. +Currenly you will never see it unless normalization is turned off by 'nn' option. + +The delimeters make sense in case of processing the CSV-like formats, but in this case you probably +need only values in results. + +This behavior may be changed later. But right now delimeters seem pointless in results. 
If you +process some sort of grammar where delimeter character is important, you may use punctuation +instead, i.e: + +```haskell +let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq +tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text] + +["(","delimeters",",","are",",","important",",","spaces are not",")",";"] +``` + +### Other +For CSV-like formats it makes sense to split text to lines first, otherwise newline characters may +cause to weird results + + +# Authors + +This library is written and maintained by Dmitry Zuikov, dzuikov@gmail.com + diff --git a/TODO b/TODO new file mode 100644 index 00000000..fc8195fe --- /dev/null +++ b/TODO @@ -0,0 +1,12 @@ + + +- [ ] TODO: Tests for Data.Text.Fuzzy.Section +- [ ] TODO: haddock for Data.Text.Fuzzy.Section + +- [~] TODO: Tests +- [+] TODO: Freeze dependencies versions +- [ ] TODO: Version number +- [+] TODO: Haddocks +- [ ] TODO: Tokenizer: Identation support +- [ ] TODO: Tokenizer: Block comments support + diff --git a/cabal.project b/cabal.project new file mode 100644 index 00000000..b6dfabf9 --- /dev/null +++ b/cabal.project @@ -0,0 +1,6 @@ +packages: *.cabal + +allow-newer: all + +tests: True + diff --git a/flake.lock b/flake.lock new file mode 100644 index 00000000..5aee32b5 --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "locked": { + "lastModified": 1644229661, + "narHash": "sha256-1YdnJAsNy69bpcjuoKdOYQX0YxZBiCYZo4Twxerqv7k=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "3cecb5b042f7f209c56ffd8371b2711a290ec797", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "haskell-flake-utils": { + "inputs": { + "flake-utils": "flake-utils" + }, + "locked": { + "lastModified": 1707809372, + "narHash": "sha256-wfTL9PlCSOqSSyU4eenFFI7pHrV21gba4GEILnI4nAU=", + "owner": "ivanovs-4", + "repo": "haskell-flake-utils", + "rev": "3cbdc5d6093e8b4464ae64097e0c8c61e4414ff2", + "type": "github" + }, + "original": { + "owner": "ivanovs-4", + "repo": "haskell-flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1727089097, + "narHash": "sha256-ZMHMThPsthhUREwDebXw7GX45bJnBCVbfnH1g5iuSPc=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "568bfef547c14ca438c56a0bece08b8bb2b71a9c", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "haskell-flake-utils": "haskell-flake-utils", + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 00000000..97657169 --- /dev/null +++ b/flake.nix @@ -0,0 +1,69 @@ +{ +description = "Haskell cabal package"; + +inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + haskell-flake-utils.url = "github:ivanovs-4/haskell-flake-utils"; + + # another-simple-haskell-flake.url = "something"; + + # some-cabal-pkg.url = "github:example/some-cabal-pkg"; + # some-cabal-pkg.flake = false; +}; + +outputs = { self, nixpkgs, haskell-flake-utils, ... 
}@inputs: + haskell-flake-utils.lib.simpleCabal2flake { + inherit self nixpkgs; + # systems = [ "x86_64-linux" ]; + + # DON'T FORGET TO PUT YOUR PACKAGE NAME HERE, REMOVING `throw` + name = "fuzzy-parse"; + + shellExtBuildInputs = {pkgs}: with pkgs; [ + haskellPackages.haskell-language-server + ]; + + # Wether to build hoogle in the default shell + shellWithHoogle = true; + + + ## Optional parameters follow + + # nixpkgs config + # config = { }; + + # Add another haskell flakes as requirements + # haskellFlakes = [ inputs.another-simple-haskell-flake ]; + + # Use this to load other flakes overlays to supplement nixpkgs + # preOverlays = [ ]; + + # Pass either a function or a file + # preOverlay = ./overlay.nix; + + # Override haskell packages + # hpPreOverrides = { pkgs }: new: old: + # with pkgs.haskell.lib; with haskell-flake-utils.lib; + # tunePackages pkgs old { + # some-haskellPackages-package = [ dontHaddock ]; + # } // { + # some-cabal-pkg = ((jailbreakUnbreak pkgs) (dontCheck (old.callCabal2nix "some-cabal-pkg" inputs.some-cabal-pkg {}))); + # }; + + # Arguments for callCabal2nix + # cabal2nixArgs = {pkgs}: { + # }; + + # Maps to the devShell output. Pass in a shell.nix file or function + # shell = ./shell.nix + + # Additional build intputs of the default shell + # shellExtBuildInputs = {pkgs}: with pkgs; [ + # haskellPackages.haskell-language-server + # ]; + + # Wether to build hoogle in the default shell + # shellWithHoogle = true; + + }; +} diff --git a/fuzzy-parse.cabal b/fuzzy-parse.cabal new file mode 100644 index 00000000..a7541580 --- /dev/null +++ b/fuzzy-parse.cabal @@ -0,0 +1,160 @@ +cabal-version: 3.0 + +name: fuzzy-parse +version: 0.1.3.1 +synopsis: Tools for processing unstructured text data + +description: + The lightweight and easy to use functions for text tokenizing and parsing. It aimed for + parsing mostly unstructured data, but the structured formats may be parsed as well. + + It may be used in different sutiations, for DSL, tex markups or even for parsing simple + grammars easier and sometimes faster than in case of usage mainstream parsing combinators + or parser generators. + + See the README.markdown, examples and modules documentation for more. 
+ +license: MIT +license-file: LICENSE +author: Dmitry Zuikov +maintainer: dzuikov@gmail.com + +category: Text, Parsing +extra-source-files: CHANGELOG.md + +homepage: https://github.com/hexresearch/fuzzy-parse +bug-reports: https://github.com/hexresearch/fuzzy-parse/issues + +extra-source-files: + README.markdown + +common shared-properties + + default-language: GHC2021 + + default-extensions: + ApplicativeDo + , BangPatterns + , BlockArguments + , ConstraintKinds + , DataKinds + , DeriveDataTypeable + , DeriveGeneric + , DerivingStrategies + , DerivingVia + , ExtendedDefaultRules + , FlexibleContexts + , FlexibleInstances + , GADTs + , GeneralizedNewtypeDeriving + , ImportQualifiedPost + , LambdaCase + , MultiParamTypeClasses + , OverloadedStrings + , QuasiQuotes + , RecordWildCards + , ScopedTypeVariables + , StandaloneDeriving + , TemplateHaskell + , TupleSections + , TypeApplications + , TypeFamilies + + + +library + + import: shared-properties + + ghc-options: + -Wall + -fno-warn-type-defaults + -O2 + "-with-rtsopts=-N4 -A64m -AL256m -I0" + + + exposed-modules: Data.Text.Fuzzy.Tokenize + , Data.Text.Fuzzy.Dates + , Data.Text.Fuzzy.Section + , Data.Text.Fuzzy.SExp + , Data.Text.Fuzzy.Attoparsec.Day + , Data.Text.Fuzzy.Attoparsec.Month + + build-depends: base + , attoparsec + , containers + , mtl + , prettyprinter + , safe + , streaming + , scientific + , text + , time + , microlens-platform + , uniplate + , unliftio + , unordered-containers + , timeit + + hs-source-dirs: src + + +executable fuzzy-sexp-parse + + import: shared-properties + default-language: GHC2021 + + ghc-options: + -Wall + -fno-warn-type-defaults + -O2 + + + main-is: FuzzySexpParse.hs + + hs-source-dirs: misc + + build-depends: base, fuzzy-parse + , containers + , hspec + , hspec-discover + , interpolatedstring-perl6 + , text + , mtl + , streaming + , transformers + , exceptions + , uniplate + , microlens-platform + , safe + , timeit + , prettyprinter + + +test-suite fuzzy-parse-test + import: shared-properties + default-language: GHC2021 + + + type: exitcode-stdio-1.0 + main-is: Spec.hs + other-modules: FuzzyParseSpec + hs-source-dirs: test + build-depends: base, fuzzy-parse + , containers + , hspec + , hspec-discover + , interpolatedstring-perl6 + , text + , mtl + , streaming + , transformers + , exceptions + , uniplate + , microlens-platform + , safe + , timeit + + build-tool-depends: hspec-discover:hspec-discover == 2.* + + diff --git a/misc/FuzzySexpParse.hs b/misc/FuzzySexpParse.hs new file mode 100644 index 00000000..e9a037fb --- /dev/null +++ b/misc/FuzzySexpParse.hs @@ -0,0 +1,30 @@ +module Main where + +import Data.Text.Fuzzy.SExp +import Data.Text.IO qualified as IO +import Data.Text qualified as Text +import Data.Either +import System.TimeIt +import Control.Monad.Except +import Data.Functor +import Data.Function +import Data.Fixed +import Prettyprinter +import System.IO + +main :: IO () +main = do + s <- IO.getContents + + (tt,toks) <- timeItT do + pure (tokenizeSexp s) + + (pt,top) <- timeItT do + runExceptT (parseTop @() s) <&> either (error.show) id + + print (vcat (fmap pretty top)) + + hPrint stderr $ pretty (Text.length s) <+> "chars, parsed in" <+> viaShow (realToFrac pt :: Fixed E6) + + + diff --git a/nix/derivations/.gitignore b/nix/derivations/.gitignore new file mode 100644 index 00000000..e69de29b diff --git a/nix/pkgs.json b/nix/pkgs.json new file mode 100644 index 00000000..5f553b6d --- /dev/null +++ b/nix/pkgs.json @@ -0,0 +1,7 @@ +{ + "url": "https://github.com/NixOS/nixpkgs.git", + "rev": 
"5cf0de2485efeccc307692eedadbb2d9bfdc7013", + "date": "2020-06-04T17:30:37+08:00", + "sha256": "07axrr50nlmnvba5ja2ihzjwczi66znak57bhcz472w22w7m3sd1", + "fetchSubmodules": false +} diff --git a/nix/pkgs.nix b/nix/pkgs.nix new file mode 100644 index 00000000..2ab3662c --- /dev/null +++ b/nix/pkgs.nix @@ -0,0 +1,5 @@ +import ((import {}).fetchFromGitHub { + owner = "NixOS"; + repo = "nixpkgs"; + inherit (builtins.fromJSON (builtins.readFile ./pkgs.json)) rev sha256; +}) diff --git a/nix/release.nix b/nix/release.nix new file mode 100644 index 00000000..f093ba4d --- /dev/null +++ b/nix/release.nix @@ -0,0 +1,29 @@ +let + pkgs = import ./pkgs.nix { inherit config; + }; + lib = pkgs.haskell.lib; + config = { + packageOverrides = pkgs: rec { + haskellPackages = pkgs.haskellPackages.override { overrides = haskOverrides; }; + }; + }; + gitignore = pkgs.callPackage (pkgs.fetchFromGitHub { + owner = "siers"; + repo = "nix-gitignore"; + rev = "ce0778ddd8b1f5f92d26480c21706b51b1af9166"; + sha256 = "1d7ab78i2k13lffskb23x8b5h24x7wkdmpvmria1v3wb9pcpkg2w"; + }) {}; + ignore = gitignore.gitignoreSourceAux '' + .stack-work + dist + dist-newstyle + .ghc.environment* + ''; + haskOverrides = new: old: + let overrides = lib.packagesFromDirectory { directory = ./derivations; } new old; + in overrides; +in rec { + inherit pkgs; + packages = { inherit (pkgs.haskellPackages) fuzzy-parse; + }; +} diff --git a/nix/update-nixpkgs.sh b/nix/update-nixpkgs.sh new file mode 100755 index 00000000..773d0b54 --- /dev/null +++ b/nix/update-nixpkgs.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env nix-shell +#! nix-shell -i bash -p nix-prefetch-git +nix-prefetch-git https://github.com/NixOS/nixpkgs.git | tee pkgs.json diff --git a/scripts/upload-docs.sh b/scripts/upload-docs.sh new file mode 100755 index 00000000..492fa288 --- /dev/null +++ b/scripts/upload-docs.sh @@ -0,0 +1,10 @@ +#!/bin/sh +set -e + +dir=$(mktemp -d dist-docs.XXXXXX) +trap 'rm -r "$dir"' EXIT + +# assumes cabal 2.4 or later +cabal v2-haddock --builddir="$dir" --haddock-for-hackage --enable-doc + +cabal upload -d --publish $dir/*-docs.tar.gz diff --git a/src/Data/Text/Fuzzy/Attoparsec/Day.hs b/src/Data/Text/Fuzzy/Attoparsec/Day.hs new file mode 100644 index 00000000..9058737a --- /dev/null +++ b/src/Data/Text/Fuzzy/Attoparsec/Day.hs @@ -0,0 +1,97 @@ +module Data.Text.Fuzzy.Attoparsec.Day ( dayDMY + , dayYMD + , dayYYYYMMDD + , dayDMonY + , day + ) where + +import Data.List (zipWith) +import Control.Applicative ((<|>)) +import Data.Attoparsec.Text (Parser,decimal,digit,count,satisfy,inClass,skipWhile) +import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength) +import qualified Data.Char as Char +import qualified Data.Text as Text + + + +day :: Parser Day +day = dayDMonY <|> dayYYYYMMDD <|> dayYMD <|> dayDMY + +skipDelim :: Parser () +skipDelim = skipWhile (inClass " ./-") + +dayDMY :: Parser Day +dayDMY = do + d <- decimal :: Parser Int + skipDelim + m <- decimal :: Parser Int + skipDelim + y' <- decimal :: Parser Integer + maybe (fail "bad date format") pure (makeDay y' m d) + +dayYMD :: Parser Day +dayYMD = do + y' <- decimal :: Parser Integer + skipDelim + m <- decimal :: Parser Int + skipDelim + d <- decimal :: Parser Int + maybe (fail "bad date format") pure (makeDay y' m d) + +dayYYYYMMDD :: Parser Day +dayYYYYMMDD = do + y <- fromIntegral . num n4 . map o <$> count 4 digit + m <- num n2 . map o <$> count 2 digit + d <- num n2 . 
map o <$> count 2 digit + maybe (fail "bad date format") pure (makeDay y m d) + where n4 = [1000,100,10,1] + n2 = [10,1] + o x = Char.ord x - Char.ord '0' + num n x = sum $ zipWith (*) x n + +dayDMonY :: Parser Day +dayDMonY = do + d <- decimal :: Parser Int + skipDelim + m <- pMon + skipDelim + y <- decimal :: Parser Integer + maybe (fail "bad date format") pure (makeDay y m d) + where + pMon :: Parser Int + pMon = do + txt <- Text.toUpper . Text.pack <$> count 3 (satisfy Char.isLetter) + case txt of + "JAN" -> pure 1 + "FEB" -> pure 2 + "MAR" -> pure 3 + "APR" -> pure 4 + "MAY" -> pure 5 + "JUN" -> pure 6 + "JUL" -> pure 7 + "AUG" -> pure 8 + "SEP" -> pure 9 + "OCT" -> pure 10 + "NOV" -> pure 11 + "DEC" -> pure 12 + _ -> fail "bad month name" + + +makeYear :: Integer -> Maybe Integer +makeYear y' = if y < 1900 && y' < 99 + then Nothing + else pure y + where + y = if y' < 50 + then y' + 2000 + else (if y' >= 50 && y' <= 99 + then y' + 1900 + else y' ) + +makeDay :: Integer -> Int -> Int -> Maybe Day +makeDay y m d | m <= 12 && m > 0 = + makeYear y >>= \yyyy -> if d <= gregorianMonthLength yyyy m + then pure $ fromGregorian yyyy m d + else Nothing + + | otherwise = Nothing diff --git a/src/Data/Text/Fuzzy/Attoparsec/Month.hs b/src/Data/Text/Fuzzy/Attoparsec/Month.hs new file mode 100644 index 00000000..42596a13 --- /dev/null +++ b/src/Data/Text/Fuzzy/Attoparsec/Month.hs @@ -0,0 +1,46 @@ +module Data.Text.Fuzzy.Attoparsec.Month ( fuzzyMonth, fuzzyMonthFromText + ) where + +import Control.Applicative ((<|>)) +import Data.Attoparsec.Text (Parser,decimal,digit,letter,many1,parseOnly) +import Data.Map (Map) +import Data.Maybe +import Data.Text (Text) +import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength) +import qualified Data.Char as Char +import qualified Data.Map as Map +import qualified Data.Text as Text + + +fuzzyMonth :: Parser Int +fuzzyMonth = pMonthNum <|> pMonth + +fuzzyMonthFromText :: Text -> Maybe Int +fuzzyMonthFromText = either (const Nothing) Just . parseOnly fuzzyMonth + +pMonthNum :: Parser Int +pMonthNum = do + n <- decimal + if n >= 1 && n <= 13 + then pure n + else fail "invalid months num" + +pMonth :: Parser Int +pMonth = do + mo <- many1 (Char.toLower <$> letter) + maybe (fail "invalid month name") pure (Map.lookup mo months) + where + months :: Map String Int + months = Map.fromList [ ("jan", 1), ("january" , 1) + , ("feb", 2), ("febuary" , 2) + , ("mar", 3), ("march" , 3) + , ("apr", 4), ("april" , 4) + , ("may", 5), ("may" , 5) + , ("jun", 6), ("june" , 6) + , ("jul", 7), ("july" , 7) + , ("aug", 8), ("august" , 8) + , ("sep", 9), ("september", 9) + , ("oct", 10), ("october" , 10) + , ("nov", 11), ("november" , 11) + , ("dec", 12), ("december" , 12) + ] diff --git a/src/Data/Text/Fuzzy/Dates.hs b/src/Data/Text/Fuzzy/Dates.hs new file mode 100644 index 00000000..9aa62c80 --- /dev/null +++ b/src/Data/Text/Fuzzy/Dates.hs @@ -0,0 +1,46 @@ +-- | +-- Module : Data.Text.Fuzzy.Dates +-- Copyright : Dmitry Zuikov 2020 +-- License : MIT +-- +-- Maintainer : dzuikov@gmail.com +-- Stability : experimental +-- Portability : unknown +-- +-- Dates fuzzy parsing. +-- Supports a number of dates format and tries to recover +-- the incomplete dates from text, with use of some +-- reasonable assumptions. Does not support locales, +-- i.e assums only English for dates yet. 
+-- +-- == Examples +-- +-- > parseMaybeDay "01.01.1979" +-- > Just 1979-01-01 +-- > parseMaybeDay "01.01.01" +-- > Just 2001-01-01 +-- > parseMaybeDay "13/01/2019" +-- > Just 2019-01-13 +-- > parseMaybeDay "2019-12-1" +-- > Just 2019-12-01 +-- > parseMaybeDay "21-feb-79" +-- > Just 1979-02-21 +-- > parseMaybeDay "21-feb-01" +-- > Just 2001-02-21 +-- > parseMaybeDay "29feb04" +-- > Just 2004-02-29 +-- > parseMaybeDay "21feb28" +-- > Just 2028-02-21 + +module Data.Text.Fuzzy.Dates where + +import Data.Attoparsec.Text (parseOnly) +import Data.Either (either) +import Data.Text.Fuzzy.Attoparsec.Day +import Data.Text (Text) +import Data.Time.Calendar + +-- | Tries to parse a date from the text. +parseMaybeDay :: Text -> Maybe Day +parseMaybeDay s = either (const Nothing) pure (parseOnly day s) + diff --git a/src/Data/Text/Fuzzy/SExp.hs b/src/Data/Text/Fuzzy/SExp.hs new file mode 100644 index 00000000..c6e21dba --- /dev/null +++ b/src/Data/Text/Fuzzy/SExp.hs @@ -0,0 +1,378 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE ExtendedDefaultRules #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE ImportQualifiedPost #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE TemplateHaskell #-} +module Data.Text.Fuzzy.SExp where + +import Data.Text (Text) + +import Control.Applicative +import Control.Monad +import Data.Function +import Data.Functor +import Data.Text.Fuzzy.Tokenize +import Control.Monad.Reader +import Data.Typeable +import Control.Monad.Except +import Control.Monad.RWS +import Data.Maybe +import Data.Char (isSpace,digitToInt) +import Data.Generics.Uniplate.Data() +import Safe +import Data.Data +import GHC.Generics +import Lens.Micro.Platform +import Data.Text qualified as Text +import Data.Coerce +import Data.Scientific + +import Data.HashMap.Strict (HashMap) +import Data.HashMap.Strict qualified as HM + +import Prettyprinter hiding (braces,list) + + +import Streaming.Prelude qualified as S + +data TTok = TChar Char + | TSChar Char + | TPunct Char + | TText Text + | TStrLit Text + | TKeyword Text + | TEmpty + | TIndent Int + deriving stock (Eq,Ord,Show,Data,Generic) + +instance IsToken TTok where + mkChar = TChar + mkSChar = TSChar + mkPunct = TPunct + mkText = TText + mkStrLit = TStrLit + mkKeyword = TKeyword + mkEmpty = TEmpty + mkIndent = TIndent + +newtype C0 = C0 (Maybe Int) + deriving stock (Eq,Ord,Show,Data,Typeable,Generic) + +data SExpParseError = + ParensOver C0 + | ParensUnder C0 + | ParensUnmatched C0 + | SyntaxError C0 + deriving stock (Show,Typeable) + + +data NumType = + NumInteger Integer + | NumDouble Scientific + deriving stock (Eq,Ord,Show,Data,Generic) + +class Monoid c => ForMicroSexp c where + +instance Monoid C0 where + mempty = C0 Nothing + +instance Semigroup C0 where + (<>) (C0 a) (C0 b) = C0 (b <|> a) + +instance ForMicroSexp C0 where + + +instance ForMicroSexp () where + +data MicroSexp c = + List_ c [MicroSexp c] + | Symbol_ c Text + | String_ c Text + | Number_ c NumType + | Boolean_ c Bool + deriving stock (Show,Data,Generic) + +pattern List :: ForMicroSexp c => [MicroSexp c] -> MicroSexp c +pattern List xs <- List_ _ xs where + List xs = List_ mempty xs + +pattern Symbol :: ForMicroSexp c => Text -> MicroSexp c +pattern Symbol xs <- Symbol_ _ xs where + Symbol xs = Symbol_ mempty xs + +pattern String :: ForMicroSexp c => Text -> MicroSexp c +pattern String x <- String_ _ x where + String x = String_ mempty x + +pattern Number :: ForMicroSexp c => NumType -> 
MicroSexp c +pattern Number n <- Number_ _ n where + Number n = Number_ mempty n + +pattern Boolean :: ForMicroSexp c => Bool -> MicroSexp c +pattern Boolean b <- Boolean_ _ b where + Boolean b = Boolean_ mempty b + +{-# COMPLETE List, Symbol, String, Number, Boolean #-} + + +contextOf :: Lens (MicroSexp c) (MicroSexp c) c c +contextOf = lens g s + where + s sexp c = case sexp of + List_ _ a -> List_ c a + Symbol_ _ a -> Symbol_ c a + String_ _ a -> String_ c a + Number_ _ a -> Number_ c a + Boolean_ _ a -> Boolean_ c a + + g = \case + List_ c _ -> c + Symbol_ c _ -> c + String_ c _ -> c + Number_ c _ -> c + Boolean_ c _ -> c + +nil :: forall c . ForMicroSexp c => MicroSexp c +nil = List [] + +symbol :: forall c . ForMicroSexp c => Text -> MicroSexp c +symbol = Symbol + +str :: forall c . ForMicroSexp c => Text -> MicroSexp c +str = String + +newtype SExpEnv = + SExpEnv + { sexpTranslate :: Bool + } + +data SExpState = + SExpState + { _sexpLno :: Int + , _sexpBraces :: [Char] + } + +makeLenses 'SExpState + +defEnv :: SExpEnv +defEnv = SExpEnv True + +newtype SExpM m a = SExpM { fromSexpM :: RWST SExpEnv () SExpState m a } + deriving newtype + ( Applicative + , Functor + , Monad + , MonadState SExpState + , MonadReader SExpEnv + , MonadTrans + ) + + +instance MonadError SExpParseError m => MonadError SExpParseError (SExpM m) where + throwError = lift . throwError + catchError w = catchError (coerce $ fromSexpM w) + +tokenizeSexp :: Text -> [TTok] +tokenizeSexp txt = do + let spec = delims " \r\t" <> comment ";" + <> punct "'{}()[]\n" + <> sqq + <> uw + tokenize spec txt + +runSexpM :: Monad m => SExpM m a -> m a +runSexpM f = evalRWST (fromSexpM f) defEnv (SExpState 0 []) <&> fst + + +parseSexp :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m (MicroSexp c) +parseSexp txt = do + (s, _) <- runSexpM do + (s,rest) <- sexp (tokenizeSexp txt) + checkBraces + pure (s,rest) + + pure s + +checkBraces :: (MonadError SExpParseError m) => SExpM m () +checkBraces = do + braces <- gets (view sexpBraces) + unless (null braces) $ raiseWith ParensUnder + +succLno :: (MonadError SExpParseError m) => SExpM m () +succLno = modify (over sexpLno succ) + +parseTop :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m [MicroSexp c] +parseTop txt = do + let tokens = tokenizeSexp txt + S.toList_ $ runSexpM do + flip fix (mempty,tokens) $ \next -> \case + (acc, []) -> do + emit acc + (acc, TPunct '\n' : rest) -> do + succLno + emit acc + next (mempty,rest) + (acc, rest) -> do + (s, xs) <- sexp rest + next (acc <> [s],xs) + + where + + emit [] = pure () + emit wtf = case wtf of + [List one] -> lift $ S.yield (List one) + xs -> lift $ S.yield (List xs) + +sexp :: (ForMicroSexp c, MonadError SExpParseError m) => [TTok] -> SExpM m (MicroSexp c, [TTok]) +sexp s = case s of + [] -> do + checkBraces + pure (nil, mempty) + + (TText l : w) -> (,w) <$> trNum (Symbol l) + + (TStrLit l : w) -> pure (String l, w) + + -- so far ignored + (TPunct '\'' : rest) -> sexp rest + + (TPunct '\n' : rest) -> succLno >> sexp rest + + (TPunct c : rest) | isSpace c -> sexp rest + + (TPunct c : rest) | isBrace c -> + maybe (pure (nil, rest)) (`list` rest) (closing c) + | otherwise -> do + raiseWith ParensOver + + ( _ : _ ) -> raiseWith SyntaxError + + where + + setContext w = do + co <- getC0 + pure $ over _2 (set contextOf co) w + + isBrace :: Char -> Bool + isBrace c = HM.member c braces + + closing :: Char -> Maybe Char + closing c = HM.lookup c braces + + braces :: HashMap Char Char + braces = HM.fromList[ ('{', '}') + 
, ('(', ')') + , ('[', ']') + , ('<', '>') + ] + + cBraces :: [Char] + cBraces = HM.elems braces + + trNum tok = do + + trans <- asks sexpTranslate + + case tok of + Symbol s | trans -> do + let s0 = Text.unpack s + + let what = Number . NumInteger <$> readMay @Integer s0 + <|> + Number . NumInteger <$> parseBinary s0 + <|> + Number . NumDouble <$> readMay @Scientific s0 + <|> + ( case s of + "#t" -> Just (Boolean True) + "#f" -> Just (Boolean False) + _ -> Nothing + ) + + pure $ fromMaybe (Symbol s) what + + + x -> pure x + {-# INLINE trNum #-} + + list :: (ForMicroSexp c, MonadError SExpParseError m) + => Char + -> [TTok] + -> SExpM m (MicroSexp c, [TTok]) + + list _ [] = raiseWith ParensUnder + + list cb tokens = do + modify $ over sexpBraces (cb:) + + go cb mempty tokens + + where + + isClosingFor :: Char -> Bool + isClosingFor c = c `elem` cBraces + + go _ _ [] = do + checkBraces + pure (List mempty, mempty) + + go cl acc (TPunct c : rest) | isSpace c = do + go cl acc rest + + go cl acc (TPunct c : rest) + | isClosingFor c && c == cl = do + modify $ over sexpBraces (drop 1) + pure (List (reverse acc), rest) + + | isClosingFor c && c /= cl = do + raiseWith ParensUnmatched + -- throwError =<< ParensUnmatched <$> undefined + + go cl acc rest = do + (e,r) <- sexp rest + go cl (e : acc) r + + +getC0 :: Monad m => SExpM m C0 +getC0 = do + lno <- gets (view sexpLno) + pure (C0 (Just lno)) + +raiseWith :: (MonadError SExpParseError m) + => (C0 -> SExpParseError) -> SExpM m b + +raiseWith a = throwError =<< a <$> getC0 + +instance Pretty NumType where + pretty = \case + NumInteger n -> pretty n + NumDouble n -> viaShow n + +instance ForMicroSexp c => Pretty (MicroSexp c) where + + pretty = \case + List xs -> parens (hsep (fmap pretty xs)) + String s -> dquotes (pretty s) + Symbol s -> pretty s + Number n -> pretty n + Boolean True -> pretty "#t" + Boolean False -> pretty "#f" + +isBinaryDigit :: Char -> Bool +isBinaryDigit c = c == '0' || c == '1' + +parseBinary :: String -> Maybe Integer +parseBinary str = + let + withoutPrefix = case str of + '0':'b':rest -> Just rest + '0':'B':rest -> Just rest + _ -> Nothing + in if isJust withoutPrefix && all isBinaryDigit (fromJust withoutPrefix) + then Just $ foldl (\acc x -> acc * 2 + toInteger (digitToInt x)) 0 (fromJust withoutPrefix) + else Nothing + diff --git a/src/Data/Text/Fuzzy/Section.hs b/src/Data/Text/Fuzzy/Section.hs new file mode 100644 index 00000000..c7e8166d --- /dev/null +++ b/src/Data/Text/Fuzzy/Section.hs @@ -0,0 +1,15 @@ +module Data.Text.Fuzzy.Section (cutSectionBy, cutSectionOn) where + +import Data.Text (Text) +import qualified Data.List as List + + +cutSectionOn :: Text -> Text -> [Text] -> [Text] +cutSectionOn a b txt = cutSectionBy ((==)a) ((==b)) txt + +cutSectionBy :: (Text -> Bool) -> (Text -> Bool) -> [Text] -> [Text] +cutSectionBy a b txt = cutI + where + cutC = List.dropWhile (not . a) txt + cutI = List.takeWhile (not . b) cutC + diff --git a/src/Data/Text/Fuzzy/Tokenize.hs b/src/Data/Text/Fuzzy/Tokenize.hs new file mode 100644 index 00000000..a8bfa6d1 --- /dev/null +++ b/src/Data/Text/Fuzzy/Tokenize.hs @@ -0,0 +1,556 @@ +-- | +-- Module : Data.Text.Fuzzy.Tokenize +-- Copyright : Dmitry Zuikov 2020 +-- License : MIT +-- +-- Maintainer : dzuikov@gmail.com +-- Stability : experimental +-- Portability : unknown +-- +-- The lightweight and multi-functional text tokenizer allowing different types of text tokenization +-- depending on it's settings. 
+-- +-- It may be used in different sutiations, for DSL, text markups or even for parsing simple grammars +-- easier and sometimes faster than in case of usage mainstream parsing combinators or parser +-- generators. +-- +-- The primary goal of this package is to parse unstructured text data, however it may be used for +-- parsing such data formats as CSV with ease. +-- +-- Currently it supports the following types of entities: atoms, string literals (currently with the +-- minimal set of escaped characters), punctuation characters and delimeters. +-- +-- == Examples +-- === Simple CSV-like tokenization +-- >>> tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text] +-- ["aaa "," bebeb "," qqq "] +-- +-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text] +-- ["aaa "," bebeb "," qqq ","","","",""] +-- +-- >>>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text] +-- [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] +-- +-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] +-- [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] +-- +-- >>> let spec = sl<>delims ":"<>sq<>emptyFields<>noslits +-- >>> tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] +-- [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing] +-- +-- >>> let spec = delims ":"<>sq<>emptyFields<>uw<>noslits +-- >>> tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] +-- [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing] +-- +-- == Notes +-- +-- === About the delimeter tokens +-- This type of tokens appears during a "delimited" +-- formats processing and disappears in results. Currenly +-- you will never see it unless normalization is turned off by 'nn' option. +-- +-- The delimeters make sense in case of processing the CSV-like formats, +-- but in this case you probably need only values in results. +-- +-- This behavior may be changed later. But right now delimeters seem pointless +-- in results. If you process some sort of grammar where delimeter character +-- is important, you may use punctuation instead, i.e: +-- +-- >>> let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq +-- >>> tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text] +-- ["(","delimeters",",","are",",","important",",","spaces are not",")",";"] +-- +-- == Other +-- For CSV-like formats it makes sense to split text to lines first, +-- otherwise newline characters may cause to weird results +-- +-- + +module Data.Text.Fuzzy.Tokenize ( TokenizeSpec + , IsToken(..) + , tokenize + , esc + , addEmptyFields + , emptyFields + , nn + , sq + , sqq + , noslits + , sl + , sr + , uw + , delims + , comment + , punct + , indent + , itabstops + , keywords + , eol + ) where + +import Prelude hiding (init) + +import Control.Applicative +import Data.Map (Map) +import Data.Maybe (fromMaybe) +import Data.Monoid() +import Data.Set (Set) +import Data.Text (Text) +import qualified Data.List as List +import qualified Data.Map as Map +import qualified Data.Set as Set +import qualified Data.Text as Text + +import Control.Monad (when) +import Control.Monad.RWS + +-- | Tokenization settings. Use mempty for an empty value +-- and construction functions for changing the settings. 
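+--
+-- Settings compose with '<>'. A brief illustrative sketch (the expected output
+-- follows from the behavior shown in the module examples above):
+--
+-- >>> tokenize (delims ";" <> emptyFields) "a;;b" :: [Maybe Text]
+-- [Just "a",Nothing,Just "b"]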
+-- +data TokenizeSpec = TokenizeSpec { tsAtoms :: Set Text + , tsStringQQ :: Maybe Bool + , tsStringQ :: Maybe Bool + , tsNoSlits :: Maybe Bool + , tsLineComment :: Map Char Text + , tsDelims :: Set Char + , tsEol :: Maybe Bool + , tsStripLeft :: Maybe Bool + , tsStripRight :: Maybe Bool + , tsUW :: Maybe Bool + , tsNotNormalize :: Maybe Bool + , tsEsc :: Maybe Bool + , tsAddEmptyFields :: Maybe Bool + , tsPunct :: Set Char + , tsIndent :: Maybe Bool + , tsItabStops :: Maybe Int + , tsKeywords :: Set Text + } + deriving (Eq,Ord,Show) + + +instance Semigroup TokenizeSpec where + (<>) a b = TokenizeSpec { tsAtoms = tsAtoms b <> tsAtoms a + , tsStringQQ = tsStringQQ b <|> tsStringQQ a + , tsStringQ = tsStringQ b <|> tsStringQ a + , tsNoSlits = tsNoSlits b <|> tsNoSlits a + , tsLineComment = tsLineComment b <> tsLineComment a + , tsDelims = tsDelims b <> tsDelims a + , tsEol = tsEol b <|> tsEol a + , tsStripLeft = tsStripLeft b <|> tsStripLeft a + , tsStripRight = tsStripRight b <|> tsStripRight a + , tsUW = tsUW b <|> tsUW a + , tsNotNormalize = tsNotNormalize b <|> tsNotNormalize a + , tsEsc = tsEsc b <|> tsEsc a + , tsAddEmptyFields = tsAddEmptyFields b <|> tsAddEmptyFields a + , tsPunct = tsPunct b <> tsPunct a + , tsIndent = tsIndent b <|> tsIndent a + , tsItabStops = tsItabStops b <|> tsItabStops a + , tsKeywords = tsKeywords b <> tsKeywords a + } + +instance Monoid TokenizeSpec where + mempty = TokenizeSpec { tsAtoms = mempty + , tsStringQQ = Nothing + , tsStringQ = Nothing + , tsNoSlits = Nothing + , tsLineComment = mempty + , tsDelims = mempty + , tsEol = Nothing + , tsStripLeft = Nothing + , tsStripRight = Nothing + , tsUW = Nothing + , tsNotNormalize = Nothing + , tsEsc = Nothing + , tsAddEmptyFields = Nothing + , tsPunct = mempty + , tsIndent = Nothing + , tsItabStops = Nothing + , tsKeywords = mempty + } + + +justTrue :: Maybe Bool -> Bool +justTrue (Just True) = True +justTrue _ = False + +-- | Turns on EOL token generation +eol :: TokenizeSpec +eol = mempty { tsEol = pure True } + +-- | Turn on character escaping inside string literals. +-- Currently the following escaped characters are +-- supported: [" ' \ t n r a b f v ] +esc :: TokenizeSpec +esc = mempty { tsEsc = pure True } + +-- | Raise empty field tokens (note mkEmpty method) +-- when no tokens found before a delimeter. +-- Useful for processing CSV-like data in +-- order to distingush empty columns +addEmptyFields :: TokenizeSpec +addEmptyFields = mempty { tsAddEmptyFields = pure True } + +-- | same as addEmptyFields +emptyFields :: TokenizeSpec +emptyFields = addEmptyFields + +-- | Turns off token normalization. Makes the tokenizer +-- generate character stream. Useful for debugging. +nn :: TokenizeSpec +nn = mempty { tsNotNormalize = pure True } + +-- | Turns on single-quoted string literals. +-- Character stream after '\'' character +-- will be proceesed as single-quoted stream, +-- assuming all delimeter, comment and other special +-- characters as a part of the string literal until +-- the next unescaped single quote character. +sq :: TokenizeSpec +sq = mempty { tsStringQ = pure True } + +-- | Enable double-quoted string literals support +-- as 'sq' for single-quoted strings. +sqq :: TokenizeSpec +sqq = mempty { tsStringQQ = pure True } + +-- | Disable separate string literals. +-- +-- Useful when processed delimeted data (csv-like formats). 
+-- Normally, sequential text chunks are concatenated together, but a text chunk
+-- followed by a string literal produces two separate tokens, which may give
+-- weird results if the data is in a CSV-like format, e.g.:
+--
+-- >>> tokenize (delims ":"<>emptyFields<>sq ) "aaa:bebe:'qq' aaa:next::" :: [Maybe Text]
+-- [Just "aaa",Just "bebe",Just "qq",Just " aaa",Just "next",Nothing,Nothing]
+--
+-- Note how "qq" and " aaa" become two separate tokens, which makes the CSV result
+-- look as if it had an extra column. This behavior can be avoided with this option
+-- if you don't need to distinguish text chunks from string literals:
+--
+-- >>> tokenize (delims ":"<>emptyFields<>sq<>noslits) "aaa:bebe:'qq:foo' aaa:next::" :: [Maybe Text]
+-- [Just "aaa",Just "bebe",Just "qq:foo aaa",Just "next",Nothing,Nothing]
+--
+noslits :: TokenizeSpec
+noslits = mempty { tsNoSlits = pure True }
+
+-- | Specify the list of delimiter characters
+-- used to split the character stream into fields. Useful for CSV-like separated formats.
+-- Support for empty fields in the token stream may be enabled with the 'addEmptyFields' function.
+delims :: String -> TokenizeSpec
+delims s = mempty { tsDelims = Set.fromList s }
+
+-- | Strip spaces on the left side of a token.
+-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
+-- processing CSV-like formats; otherwise 'delims' may be used to skip unwanted spaces.
+sl :: TokenizeSpec
+sl = mempty { tsStripLeft = pure True }
+
+-- | Strip spaces on the right side of a token.
+-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
+-- processing CSV-like formats; otherwise 'delims' may be used to skip unwanted spaces.
+sr :: TokenizeSpec
+sr = mempty { tsStripRight = pure True }
+
+-- | Strips spaces on both sides of a token and collapses runs of spaces into a single space.
+-- The name comes from @unwords . words@.
+--
+-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
+-- processing CSV-like formats; otherwise 'delims' may be used to skip unwanted spaces.
+uw :: TokenizeSpec
+uw = mempty { tsUW = pure True }
+
+-- | Specify a line comment prefix.
+-- All text after the line comment prefix is ignored
+-- up to the next newline character.
+-- Multiple line comment prefixes are supported.
+comment :: Text -> TokenizeSpec
+comment s = mempty { tsLineComment = cmt }
+  where
+    cmt = case Text.uncons s of
+            Just (p,su) -> Map.singleton p su
+            Nothing     -> mempty
+
+-- | Specify the punctuation characters.
+-- Each punctuation character is emitted as a separate
+-- token, and any token is broken at a punctuation character.
+--
+-- Useful for handling punctuation, like
+--
+-- >> function(a,b)
+--
+-- or
+--
+-- >> (apply function 1 2 3)
+--
+--
+-- >>> tokenize spec "(apply function 1 2 3)" :: [Text]
+-- ["(","apply","function","1","2","3",")"]
+--
+punct :: Text -> TokenizeSpec
+punct s = mempty { tsPunct = Set.fromList (Text.unpack s) }
+
+-- | Specify the list of keywords.
+-- Each keyword is emitted as a separate keyword token.
+keywords :: [Text] -> TokenizeSpec
+keywords s = mempty { tsKeywords = Set.fromList s }
+
+-- | Enable indentation support
+indent :: TokenizeSpec
+indent = mempty { tsIndent = Just True }
+
+-- | Set the tab expansion multiplier,
+-- i.e. each tab expands into n spaces before processing.
+-- It also turns on indentation. Only the tabs at the beginning of a line are expanded,
+-- i.e. 
before the first non-space character appears. +itabstops :: Int -> TokenizeSpec +itabstops n = mempty { tsIndent = Just True, tsItabStops = pure n } + +newtype TokenizeM w a = TokenizeM (RWS TokenizeSpec w () a) + deriving( Applicative + , Functor + , MonadReader TokenizeSpec + , MonadWriter w + , MonadState () + , Monad + ) + +data Token = TChar Char + | TSChar Char + | TPunct Char + | TText Text + | TSLit Text + | TKeyword Text + | TEmpty + | TDelim + | TIndent Int + | TEol + deriving (Eq,Ord,Show) + +-- | Typeclass for token values. +-- Note, that some tokens appear in results +-- only when 'nn' option is set, i.e. sequences +-- of characters turn out to text tokens or string literals +-- and delimeter tokens are just removed from the +-- results +class IsToken a where + -- | Create a character token + mkChar :: Char -> a + -- | Create a string literal character token + mkSChar :: Char -> a + -- | Create a punctuation token + mkPunct :: Char -> a + -- | Create a text chunk token + mkText :: Text -> a + -- | Create a string literal token + mkStrLit :: Text -> a + -- | Create a keyword token + mkKeyword :: Text -> a + -- | Create an empty field token + mkEmpty :: a + -- | Create a delimeter token + mkDelim :: a + mkDelim = mkEmpty + + -- | Creates an indent token + mkIndent :: Int -> a + mkIndent = const mkEmpty + + -- | Creates an EOL token + mkEol :: a + mkEol = mkEmpty + +instance IsToken (Maybe Text) where + mkChar = pure . Text.singleton + mkSChar = pure . Text.singleton + mkPunct = pure . Text.singleton + mkText = pure + mkStrLit = pure + mkKeyword = pure + mkEmpty = Nothing + +instance IsToken Text where + mkChar = Text.singleton + mkSChar = Text.singleton + mkPunct = Text.singleton + mkText = id + mkStrLit = id + mkKeyword = id + mkEmpty = "" + +-- | Tokenize a text +tokenize :: IsToken a => TokenizeSpec -> Text -> [a] +tokenize s t = map tr t1 + where + t1 = tokenize' s t + tr (TChar c) = mkChar c + tr (TSChar c) = mkSChar c + tr (TText c) = mkText c + tr (TSLit c) = mkStrLit c + tr (TKeyword c) = mkKeyword c + tr TEmpty = mkEmpty + tr (TPunct c) = mkPunct c + tr TDelim = mkDelim + tr (TIndent n) = mkIndent n + tr TEol = mkEol + +execTokenizeM :: TokenizeM [Token] a -> TokenizeSpec -> [Token] +execTokenizeM (TokenizeM m) spec = + let (_,w) = execRWS m spec () in norm w + + where norm x | justTrue (tsNotNormalize spec) = x + | otherwise = normalize spec x + +tokenize' :: TokenizeSpec -> Text -> [Token] +tokenize' spec txt = execTokenizeM (root' txt) spec + where + + r = spec + + noIndent = not doIndent + doIndent = justTrue (tsIndent r) + eolOk = justTrue (tsEol r) + + root' x = scanIndent x >>= root + + root ts = do + + case Text.uncons ts of + Nothing -> pure () + + Just ('\n', rest) | doIndent -> raiseEol >> root' rest + Just (c, rest) | Set.member c (tsDelims r) -> tell [TDelim] >> root rest + Just ('\'', rest) | justTrue (tsStringQ r) -> scanQ '\'' rest + Just ('"', rest) | justTrue (tsStringQQ r) -> scanQ '"' rest + + Just (c, rest) | Map.member c (tsLineComment r) -> scanComment (c,rest) + + Just (c, rest) | Set.member c (tsPunct r) -> tell [TPunct c] >> root rest + + Just (c, rest) | otherwise -> tell [TChar c] >> root rest + + + raiseEol | eolOk = tell [TEol] + | otherwise = pure () + + expandSpace ' ' = 1 + expandSpace '\t' = (fromMaybe 8 (tsItabStops r)) + expandSpace _ = 0 + + scanIndent x | noIndent = pure x + | otherwise = do + let (ss,as) = Text.span (\c -> c == ' ' || c == '\t') x + tell [ TIndent (sum (map expandSpace (Text.unpack ss))) ] + pure as + + scanComment 
(c,rest) = do + suff <- Map.lookup c <$> asks tsLineComment + case suff of + Just t | Text.isPrefixOf t rest -> do + root $ Text.dropWhile ('\n' /=) rest + + _ -> tell [TChar c] >> root rest + + scanQ q ts = do + + case Text.uncons ts of + Nothing -> root ts + + Just ('\\', rest) | justTrue (tsEsc r) -> unesc (scanQ q) rest + | otherwise -> tell [tsChar '\\'] >> scanQ q rest + + Just (c, rest) | c == q -> root rest + | otherwise -> tell [tsChar c] >> scanQ q rest + + unesc f ts = + case Text.uncons ts of + Nothing -> f ts + Just ('"', rs) -> tell [tsChar '"' ] >> f rs + Just ('\'', rs) -> tell [tsChar '\''] >> f rs + Just ('\\', rs) -> tell [tsChar '\\'] >> f rs + Just ('t', rs) -> tell [tsChar '\t'] >> f rs + Just ('n', rs) -> tell [tsChar '\n'] >> f rs + Just ('r', rs) -> tell [tsChar '\r'] >> f rs + Just ('a', rs) -> tell [tsChar '\a'] >> f rs + Just ('b', rs) -> tell [tsChar '\b'] >> f rs + Just ('f', rs) -> tell [tsChar '\f'] >> f rs + Just ('v', rs) -> tell [tsChar '\v'] >> f rs + Just (_, rs) -> f rs + + tsChar c | justTrue (tsNoSlits spec) = TChar c + | otherwise = TSChar c + +newtype NormStats = NormStats { nstatBeforeDelim :: Int } + +normalize :: TokenizeSpec -> [Token] -> [Token] +normalize spec tokens = snd $ execRWS (go tokens) () init + where + + go [] = addEmptyField + + go s@(TIndent _ : _) = do + let (iis, rest') = List.span isIndent s + tell [TIndent (sum [k | TIndent k <- iis])] + go rest' + + go (TChar c0 : cs) = do + let (n,ns) = List.span isTChar cs + succStat + let chunk = eatSpaces $ Text.pack (c0 : [ c | TChar c <- n]) + let kw = Set.member chunk (tsKeywords spec) + tell [ if kw then TKeyword chunk else TText chunk ] + go ns + + go (TSChar x : xs) = do + let (n,ns) = List.span isTSChar xs + succStat + tell [ TSLit $ Text.pack (x : [ c | TSChar c <- n]) ] + go ns + + go (TDelim : xs) = do + addEmptyField + pruneStat + go xs + + go (TPunct c : xs) = do + tell [ TPunct c ] + succStat + go xs + + go (x:xs) = tell [x] >> go xs + + succStat = do + modify (\x -> x { nstatBeforeDelim = succ (nstatBeforeDelim x)}) + + pruneStat = do + modify (\x -> x { nstatBeforeDelim = 0 } ) + + addEmptyField = do + ns <- gets nstatBeforeDelim + when (ns == 0 && justTrue (tsAddEmptyFields spec) ) $ do + tell [ TEmpty ] + + isTChar (TChar _) = True + isTChar _ = False + + isTSChar (TSChar _) = True + isTSChar _ = False + + isIndent (TIndent _) = True + isIndent _ = False + + init = NormStats { nstatBeforeDelim = 0 } + + eatSpaces s | sboth = Text.strip s + | sLonly = Text.stripStart s + | sRonly = Text.stripEnd s + | sWU = (Text.unwords . 
Text.words) s + | otherwise = s + + where sboth = justTrue (tsStripLeft spec) && justTrue (tsStripRight spec) + sLonly = justTrue (tsStripLeft spec) && not (justTrue (tsStripRight spec)) + sRonly = not (justTrue (tsStripLeft spec)) && justTrue (tsStripRight spec) + sWU = justTrue (tsUW spec) + diff --git a/src/Data/Text/LSH.hs b/src/Data/Text/LSH.hs new file mode 100644 index 00000000..10ccfc2d --- /dev/null +++ b/src/Data/Text/LSH.hs @@ -0,0 +1 @@ +module Data.Text.LSH where \ No newline at end of file diff --git a/test/FuzzyParseSpec.hs b/test/FuzzyParseSpec.hs new file mode 100644 index 00000000..bd868343 --- /dev/null +++ b/test/FuzzyParseSpec.hs @@ -0,0 +1,150 @@ +{-# LANGUAGE OverloadedStrings + , QuasiQuotes + , ExtendedDefaultRules + , LambdaCase + , ImportQualifiedPost + , DerivingStrategies + , PatternSynonyms + , ViewPatterns + , MultiWayIf + , TemplateHaskell +#-} + +module FuzzyParseSpec (spec) where + +import Data.Text (Text) +import Test.Hspec +import Text.InterpolatedString.Perl6 (q) + +import Data.Text.Fuzzy.Tokenize +import Data.Data +import Data.Generics.Uniplate.Data() +import GHC.Generics + +data TTok = TChar Char + | TSChar Char + | TPunct Char + | TText Text + | TStrLit Text + | TKeyword Text + | TEmpty + | TIndent Int + deriving stock (Eq,Ord,Show,Data,Generic) + +instance IsToken TTok where + mkChar = TChar + mkSChar = TSChar + mkPunct = TPunct + mkText = TText + mkStrLit = TStrLit + mkKeyword = TKeyword + mkEmpty = TEmpty + mkIndent = TIndent + + + +spec :: Spec +spec = do + describe "csv-like" $ do + it "splits text using ':' delimeter" $ do + let toks = tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text] + toks `shouldBe` ["aaa "," bebeb "," qqq "] + + it "splits text using ':' delimeter with single-quotes string and empty fields" $ do + let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text] + toks `shouldBe` ["aaa "," bebeb "," qqq ","","","",""] + + it "splits text using ':' delimeter with single-quotes string and empty fields" $ do + let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + toks `shouldBe` [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing] + + it "splits text using ':' delimeter with single-quotes string and empty fields with noslits" $ do + let spec = sl<>delims ":"<>sq<>emptyFields<>noslits + let toks = tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + toks `shouldBe` [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing] + + it "splits text using ':' delimeter with single-quotes string and empty fields with noslits and uw" $ do + let spec = delims ":"<>sq<>emptyFields<>uw<>noslits + let toks = tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text] + toks `shouldBe` [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing] + + it "uses punctuation tokens" $ do + let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq + let toks = tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text] + toks `shouldBe` ["(","delimeters",",","are",",","important",",","spaces are not",")",";"] + + + it "tokenize simple lisp-like text with keywords" $ do + let spec = delims " \n\t" <> comment ";" + <> punct "{}()[]<>" + <> sq <> sqq + <> uw + <> keywords ["define","apply","+"] + + let code = [q| + (define add (a b ) ; define simple function + (+ a b) ) + (define r (add 10 20)) +|] + + let toks = tokenize 
spec code :: [TTok] + + let expected = [ TPunct '(' + , TKeyword "define" + , TText "add" , TPunct '(', TText "a" , TText "b", TPunct ')' + , TPunct '(', TKeyword "+", TText "a",TText "b",TPunct ')',TPunct ')' + , TPunct '(',TKeyword "define" + ,TText "r" + ,TPunct '(',TText "add",TText "10",TText "20" + ,TPunct ')',TPunct ')'] + + toks `shouldBe` expected + + + describe "Checks indentation support" $ do + + let spec = delims " \n\t" <> comment ";" + <> punct "{}()[]<>" + <> sq <> sqq + <> uw + <> indent + <> itabstops 8 + <> keywords ["define"] + + + + it "parses some indented blocks" $ do + + let expected = [ TIndent 0, TKeyword "define", TText "a", TText "0" + , TIndent 2, TText "atom", TText "foo", TText "2" + , TIndent 2, TKeyword "define", TText "aq", TText "2" + , TIndent 4, TText "atom", TText "one", TText "4" + , TIndent 4, TText "atom", TText "two", TText "4" + , TIndent 0, TKeyword "define", TText "b", TText "0" + , TIndent 2, TText "atom", TText "baar", TText "2" + , TIndent 2, TText "atom", TText "quux", TText "2" + , TIndent 2, TKeyword "define", TText "new", TText "2" + , TIndent 6, TText "atom", TText "bar", TText "6" + , TIndent 4, TText "atom", TText "fuu", TText "4" + , TIndent 0 + ] + + let pyLike = [q| +define a 0 + atom foo 2 + define aq 2 + atom one 4 + atom two 4 + +define b 0 + atom baar 2 + atom quux 2 + define new 2 + atom bar 6 + atom fuu 4 + +|] + let toks = tokenize spec pyLike :: [TTok] + toks `shouldBe` expected + + diff --git a/test/Spec.hs b/test/Spec.hs new file mode 100644 index 00000000..a824f8c3 --- /dev/null +++ b/test/Spec.hs @@ -0,0 +1 @@ +{-# OPTIONS_GHC -F -pgmF hspec-discover #-}