mirror of https://github.com/voidlizard/hbs2
Squashed 'miscellaneous/fuzzy-parse/' content from commit a834b152e
git-subtree-dir: miscellaneous/fuzzy-parse git-subtree-split: a834b152e29d632c816eefe117036e5d9330bd03
@ -0,0 +1,5 @@
if [ -f .envrc.local ]; then
  source_env .envrc.local
fi

use flake
@ -0,0 +1,6 @@
*.swp
dist-newstyle/
Setup.hs

.direnv
.hbs2-git/
@ -0,0 +1,10 @@
# Revision history for fuzzy-parse

## 0.1.2.0

- Technical release
- Added some missing pieces

## 0.1.0.0 -- YYYY-mm-dd

* First version. Released on an unsuspecting world.
@ -0,0 +1,20 @@
|
|||
Copyright (c) 2019 Dmitry Zuikov
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included
|
||||
in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@ -0,0 +1,131 @@
# About

# Data.Text.Fuzzy.Tokenize

A lightweight and multi-functional text tokenizer that supports different types of text tokenization
depending on its settings.

It may be used in many situations: for DSLs, text markup, or even for parsing simple grammars,
often more easily and sometimes faster than with mainstream parser combinators or parser
generators.

The primary goal of this package is to parse unstructured text data; however, it also handles
data formats such as CSV with ease.

Currently it supports the following types of entities: atoms, string literals (currently with a
minimal set of escape characters), punctuation characters and delimiters.

## Examples

### Simple CSV-like tokenization

```haskell
tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]

["aaa "," bebeb "," qqq "]
```

```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]

["aaa "," bebeb "," qqq ","","","",""]
```

```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]

[Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```

```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]

[Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```

```haskell
let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]

[Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
```

```haskell
let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]

[Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
```

### Primitive lisp-like language

```haskell
{-# LANGUAGE OverloadedStrings, QuasiQuotes, ExtendedDefaultRules #-}

import Data.Text (Text)
import Text.InterpolatedString.Perl6 (q)
import Data.Text.Fuzzy.Tokenize

data TTok = TChar Char
          | TSChar Char
          | TPunct Char
          | TText Text
          | TStrLit Text
          | TKeyword Text
          | TEmpty
          deriving (Eq,Ord,Show)

instance IsToken TTok where
  mkChar    = TChar
  mkSChar   = TSChar
  mkPunct   = TPunct
  mkText    = TText
  mkStrLit  = TStrLit
  mkKeyword = TKeyword
  mkEmpty   = TEmpty

main = do

  let spec = delims " \n\t" <> comment ";"
               <> punct "{}()[]<>"
               <> sq <> sqq
               <> uw
               <> keywords ["define","apply","+"]
  let code = [q|
(define add (a b ) ; define simple function
  (+ a b) )
(define r (add 10 20))
|]
  let toks = tokenize spec code :: [TTok]

  print toks
```
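
`print toks` prints the token stream; the test suite in this repository checks exactly this
result for the same snippet and the same settings:

```haskell
[ TPunct '(', TKeyword "define", TText "add", TPunct '(', TText "a", TText "b", TPunct ')'
, TPunct '(', TKeyword "+", TText "a", TText "b", TPunct ')', TPunct ')'
, TPunct '(', TKeyword "define", TText "r", TPunct '(', TText "add", TText "10", TText "20", TPunct ')', TPunct ')'
]
```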

## Notes

### About the delimiter tokens

Delimiter tokens appear while processing "delimited" formats and disappear from the results.
Currently you will never see them unless normalization is turned off with the 'nn' option.

Delimiters make sense when processing CSV-like formats, but in that case you probably
only need the values in the results.

This behavior may change later, but right now delimiters seem pointless in the results. If you
process some sort of grammar where the delimiter character is important, you may use punctuation
instead, e.g.:

```haskell
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]

["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
```

### Other
For CSV-like formats it makes sense to split the text into lines first; otherwise newline
characters may cause weird results.
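
A minimal sketch of that line-by-line approach (the helper names here are just for
illustration; it reuses the column layout from the examples above):

```haskell
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.Fuzzy.Tokenize

-- Tokenize each line separately so newlines never leak into field values.
parseCsv :: Text -> [[Maybe Text]]
parseCsv = map row . Text.lines
  where
    spec = delims ":" <> sq <> emptyFields <> noslits
    row line = tokenize spec line :: [Maybe Text]
```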


# Authors

This library is written and maintained by Dmitry Zuikov, dzuikov@gmail.com
@ -0,0 +1,12 @@


- [ ] TODO: Tests for Data.Text.Fuzzy.Section
- [ ] TODO: haddock for Data.Text.Fuzzy.Section

- [~] TODO: Tests
- [+] TODO: Freeze dependency versions
- [ ] TODO: Version number
- [+] TODO: Haddocks
- [ ] TODO: Tokenizer: Indentation support
- [ ] TODO: Tokenizer: Block comments support

@ -0,0 +1,6 @@
packages: *.cabal

allow-newer: all

tests: True

@ -0,0 +1,61 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"locked": {
|
||||
"lastModified": 1644229661,
|
||||
"narHash": "sha256-1YdnJAsNy69bpcjuoKdOYQX0YxZBiCYZo4Twxerqv7k=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "3cecb5b042f7f209c56ffd8371b2711a290ec797",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"haskell-flake-utils": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1707809372,
|
||||
"narHash": "sha256-wfTL9PlCSOqSSyU4eenFFI7pHrV21gba4GEILnI4nAU=",
|
||||
"owner": "ivanovs-4",
|
||||
"repo": "haskell-flake-utils",
|
||||
"rev": "3cbdc5d6093e8b4464ae64097e0c8c61e4414ff2",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "ivanovs-4",
|
||||
"repo": "haskell-flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1727089097,
|
||||
"narHash": "sha256-ZMHMThPsthhUREwDebXw7GX45bJnBCVbfnH1g5iuSPc=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "568bfef547c14ca438c56a0bece08b8bb2b71a9c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixpkgs-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"haskell-flake-utils": "haskell-flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
|
@ -0,0 +1,69 @@
|
|||
{
|
||||
description = "Haskell cabal package";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
|
||||
haskell-flake-utils.url = "github:ivanovs-4/haskell-flake-utils";
|
||||
|
||||
# another-simple-haskell-flake.url = "something";
|
||||
|
||||
# some-cabal-pkg.url = "github:example/some-cabal-pkg";
|
||||
# some-cabal-pkg.flake = false;
|
||||
};
|
||||
|
||||
outputs = { self, nixpkgs, haskell-flake-utils, ... }@inputs:
|
||||
haskell-flake-utils.lib.simpleCabal2flake {
|
||||
inherit self nixpkgs;
|
||||
# systems = [ "x86_64-linux" ];
|
||||
|
||||
# DON'T FORGET TO PUT YOUR PACKAGE NAME HERE, REMOVING `throw`
|
||||
name = "fuzzy-parse";
|
||||
|
||||
shellExtBuildInputs = {pkgs}: with pkgs; [
|
||||
haskellPackages.haskell-language-server
|
||||
];
|
||||
|
||||
# Whether to build hoogle in the default shell
|
||||
shellWithHoogle = true;
|
||||
|
||||
|
||||
## Optional parameters follow
|
||||
|
||||
# nixpkgs config
|
||||
# config = { };
|
||||
|
||||
# Add other Haskell flakes as requirements
|
||||
# haskellFlakes = [ inputs.another-simple-haskell-flake ];
|
||||
|
||||
# Use this to load other flakes overlays to supplement nixpkgs
|
||||
# preOverlays = [ ];
|
||||
|
||||
# Pass either a function or a file
|
||||
# preOverlay = ./overlay.nix;
|
||||
|
||||
# Override haskell packages
|
||||
# hpPreOverrides = { pkgs }: new: old:
|
||||
# with pkgs.haskell.lib; with haskell-flake-utils.lib;
|
||||
# tunePackages pkgs old {
|
||||
# some-haskellPackages-package = [ dontHaddock ];
|
||||
# } // {
|
||||
# some-cabal-pkg = ((jailbreakUnbreak pkgs) (dontCheck (old.callCabal2nix "some-cabal-pkg" inputs.some-cabal-pkg {})));
|
||||
# };
|
||||
|
||||
# Arguments for callCabal2nix
|
||||
# cabal2nixArgs = {pkgs}: {
|
||||
# };
|
||||
|
||||
# Maps to the devShell output. Pass in a shell.nix file or function
|
||||
# shell = ./shell.nix
|
||||
|
||||
# Additional build inputs of the default shell
|
||||
# shellExtBuildInputs = {pkgs}: with pkgs; [
|
||||
# haskellPackages.haskell-language-server
|
||||
# ];
|
||||
|
||||
# Whether to build hoogle in the default shell
|
||||
# shellWithHoogle = true;
|
||||
|
||||
};
|
||||
}
|
|
@ -0,0 +1,160 @@
|
|||
cabal-version: 3.0
|
||||
|
||||
name: fuzzy-parse
|
||||
version: 0.1.3.1
|
||||
synopsis: Tools for processing unstructured text data
|
||||
|
||||
description:
|
||||
Lightweight and easy-to-use functions for text tokenizing and parsing. It is aimed at
parsing mostly unstructured data, but structured formats may be parsed as well.

It may be used in many situations: for DSLs, text markup, or even for parsing simple
grammars, often more easily and sometimes faster than with mainstream parser combinators
or parser generators.
|
||||
|
||||
See the README.markdown, examples and modules documentation for more.
|
||||
|
||||
license: MIT
|
||||
license-file: LICENSE
|
||||
author: Dmitry Zuikov
|
||||
maintainer: dzuikov@gmail.com
|
||||
|
||||
category: Text, Parsing
|
||||
extra-source-files: CHANGELOG.md
|
||||
|
||||
homepage: https://github.com/hexresearch/fuzzy-parse
|
||||
bug-reports: https://github.com/hexresearch/fuzzy-parse/issues
|
||||
|
||||
extra-source-files:
|
||||
README.markdown
|
||||
|
||||
common shared-properties
|
||||
|
||||
default-language: GHC2021
|
||||
|
||||
default-extensions:
|
||||
ApplicativeDo
|
||||
, BangPatterns
|
||||
, BlockArguments
|
||||
, ConstraintKinds
|
||||
, DataKinds
|
||||
, DeriveDataTypeable
|
||||
, DeriveGeneric
|
||||
, DerivingStrategies
|
||||
, DerivingVia
|
||||
, ExtendedDefaultRules
|
||||
, FlexibleContexts
|
||||
, FlexibleInstances
|
||||
, GADTs
|
||||
, GeneralizedNewtypeDeriving
|
||||
, ImportQualifiedPost
|
||||
, LambdaCase
|
||||
, MultiParamTypeClasses
|
||||
, OverloadedStrings
|
||||
, QuasiQuotes
|
||||
, RecordWildCards
|
||||
, ScopedTypeVariables
|
||||
, StandaloneDeriving
|
||||
, TemplateHaskell
|
||||
, TupleSections
|
||||
, TypeApplications
|
||||
, TypeFamilies
|
||||
|
||||
|
||||
|
||||
library
|
||||
|
||||
import: shared-properties
|
||||
|
||||
ghc-options:
|
||||
-Wall
|
||||
-fno-warn-type-defaults
|
||||
-O2
|
||||
"-with-rtsopts=-N4 -A64m -AL256m -I0"
|
||||
|
||||
|
||||
exposed-modules: Data.Text.Fuzzy.Tokenize
|
||||
, Data.Text.Fuzzy.Dates
|
||||
, Data.Text.Fuzzy.Section
|
||||
, Data.Text.Fuzzy.SExp
|
||||
, Data.Text.Fuzzy.Attoparsec.Day
|
||||
, Data.Text.Fuzzy.Attoparsec.Month
|
||||
|
||||
build-depends: base
|
||||
, attoparsec
|
||||
, containers
|
||||
, mtl
|
||||
, prettyprinter
|
||||
, safe
|
||||
, streaming
|
||||
, scientific
|
||||
, text
|
||||
, time
|
||||
, microlens-platform
|
||||
, uniplate
|
||||
, unliftio
|
||||
, unordered-containers
|
||||
, timeit
|
||||
|
||||
hs-source-dirs: src
|
||||
|
||||
|
||||
executable fuzzy-sexp-parse
|
||||
|
||||
import: shared-properties
|
||||
default-language: GHC2021
|
||||
|
||||
ghc-options:
|
||||
-Wall
|
||||
-fno-warn-type-defaults
|
||||
-O2
|
||||
|
||||
|
||||
main-is: FuzzySexpParse.hs
|
||||
|
||||
hs-source-dirs: misc
|
||||
|
||||
build-depends: base, fuzzy-parse
|
||||
, containers
|
||||
, hspec
|
||||
, hspec-discover
|
||||
, interpolatedstring-perl6
|
||||
, text
|
||||
, mtl
|
||||
, streaming
|
||||
, transformers
|
||||
, exceptions
|
||||
, uniplate
|
||||
, microlens-platform
|
||||
, safe
|
||||
, timeit
|
||||
, prettyprinter
|
||||
|
||||
|
||||
test-suite fuzzy-parse-test
|
||||
import: shared-properties
|
||||
default-language: GHC2021
|
||||
|
||||
|
||||
type: exitcode-stdio-1.0
|
||||
main-is: Spec.hs
|
||||
other-modules: FuzzyParseSpec
|
||||
hs-source-dirs: test
|
||||
build-depends: base, fuzzy-parse
|
||||
, containers
|
||||
, hspec
|
||||
, hspec-discover
|
||||
, interpolatedstring-perl6
|
||||
, text
|
||||
, mtl
|
||||
, streaming
|
||||
, transformers
|
||||
, exceptions
|
||||
, uniplate
|
||||
, microlens-platform
|
||||
, safe
|
||||
, timeit
|
||||
|
||||
build-tool-depends: hspec-discover:hspec-discover == 2.*
|
||||
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
module Main where
|
||||
|
||||
import Data.Text.Fuzzy.SExp
|
||||
import Data.Text.IO qualified as IO
|
||||
import Data.Text qualified as Text
|
||||
import Data.Either
|
||||
import System.TimeIt
|
||||
import Control.Monad.Except
|
||||
import Data.Functor
|
||||
import Data.Function
|
||||
import Data.Fixed
|
||||
import Prettyprinter
|
||||
import System.IO
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
s <- IO.getContents
|
||||
|
||||
(tt,toks) <- timeItT do
|
||||
pure (tokenizeSexp s)
|
||||
|
||||
(pt,top) <- timeItT do
|
||||
runExceptT (parseTop @() s) <&> either (error.show) id
|
||||
|
||||
print (vcat (fmap pretty top))
|
||||
|
||||
hPrint stderr $ pretty (Text.length s) <+> "chars, parsed in" <+> viaShow (realToFrac pt :: Fixed E6)
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
{
|
||||
"url": "https://github.com/NixOS/nixpkgs.git",
|
||||
"rev": "5cf0de2485efeccc307692eedadbb2d9bfdc7013",
|
||||
"date": "2020-06-04T17:30:37+08:00",
|
||||
"sha256": "07axrr50nlmnvba5ja2ihzjwczi66znak57bhcz472w22w7m3sd1",
|
||||
"fetchSubmodules": false
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
import ((import <nixpkgs> {}).fetchFromGitHub {
|
||||
owner = "NixOS";
|
||||
repo = "nixpkgs";
|
||||
inherit (builtins.fromJSON (builtins.readFile ./pkgs.json)) rev sha256;
|
||||
})
|
|
@ -0,0 +1,29 @@
|
|||
let
|
||||
pkgs = import ./pkgs.nix { inherit config;
|
||||
};
|
||||
lib = pkgs.haskell.lib;
|
||||
config = {
|
||||
packageOverrides = pkgs: rec {
|
||||
haskellPackages = pkgs.haskellPackages.override { overrides = haskOverrides; };
|
||||
};
|
||||
};
|
||||
gitignore = pkgs.callPackage (pkgs.fetchFromGitHub {
|
||||
owner = "siers";
|
||||
repo = "nix-gitignore";
|
||||
rev = "ce0778ddd8b1f5f92d26480c21706b51b1af9166";
|
||||
sha256 = "1d7ab78i2k13lffskb23x8b5h24x7wkdmpvmria1v3wb9pcpkg2w";
|
||||
}) {};
|
||||
ignore = gitignore.gitignoreSourceAux ''
|
||||
.stack-work
|
||||
dist
|
||||
dist-newstyle
|
||||
.ghc.environment*
|
||||
'';
|
||||
haskOverrides = new: old:
|
||||
let overrides = lib.packagesFromDirectory { directory = ./derivations; } new old;
|
||||
in overrides;
|
||||
in rec {
|
||||
inherit pkgs;
|
||||
packages = { inherit (pkgs.haskellPackages) fuzzy-parse;
|
||||
};
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
#!/usr/bin/env nix-shell
|
||||
#! nix-shell -i bash -p nix-prefetch-git
|
||||
nix-prefetch-git https://github.com/NixOS/nixpkgs.git | tee pkgs.json
|
|
@ -0,0 +1,10 @@
|
|||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
dir=$(mktemp -d dist-docs.XXXXXX)
|
||||
trap 'rm -r "$dir"' EXIT
|
||||
|
||||
# assumes cabal 2.4 or later
|
||||
cabal v2-haddock --builddir="$dir" --haddock-for-hackage --enable-doc
|
||||
|
||||
cabal upload -d --publish $dir/*-docs.tar.gz
|
|
@ -0,0 +1,97 @@
|
|||
module Data.Text.Fuzzy.Attoparsec.Day ( dayDMY
|
||||
, dayYMD
|
||||
, dayYYYYMMDD
|
||||
, dayDMonY
|
||||
, day
|
||||
) where
|
||||
|
||||
import Data.List (zipWith)
|
||||
import Control.Applicative ((<|>))
|
||||
import Data.Attoparsec.Text (Parser,decimal,digit,count,satisfy,inClass,skipWhile)
|
||||
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
|
||||
import qualified Data.Char as Char
|
||||
import qualified Data.Text as Text
|
||||
|
||||
|
||||
|
||||
day :: Parser Day
|
||||
day = dayDMonY <|> dayYYYYMMDD <|> dayYMD <|> dayDMY
|
||||
|
||||
skipDelim :: Parser ()
|
||||
skipDelim = skipWhile (inClass " ./-")
|
||||
|
||||
dayDMY :: Parser Day
|
||||
dayDMY = do
|
||||
d <- decimal :: Parser Int
|
||||
skipDelim
|
||||
m <- decimal :: Parser Int
|
||||
skipDelim
|
||||
y' <- decimal :: Parser Integer
|
||||
maybe (fail "bad date format") pure (makeDay y' m d)
|
||||
|
||||
dayYMD :: Parser Day
|
||||
dayYMD = do
|
||||
y' <- decimal :: Parser Integer
|
||||
skipDelim
|
||||
m <- decimal :: Parser Int
|
||||
skipDelim
|
||||
d <- decimal :: Parser Int
|
||||
maybe (fail "bad date format") pure (makeDay y' m d)
|
||||
|
||||
dayYYYYMMDD :: Parser Day
|
||||
dayYYYYMMDD = do
|
||||
y <- fromIntegral . num n4 . map o <$> count 4 digit
|
||||
m <- num n2 . map o <$> count 2 digit
|
||||
d <- num n2 . map o <$> count 2 digit
|
||||
maybe (fail "bad date format") pure (makeDay y m d)
|
||||
where n4 = [1000,100,10,1]
|
||||
n2 = [10,1]
|
||||
o x = Char.ord x - Char.ord '0'
|
||||
num n x = sum $ zipWith (*) x n
|
||||
|
||||
dayDMonY :: Parser Day
|
||||
dayDMonY = do
|
||||
d <- decimal :: Parser Int
|
||||
skipDelim
|
||||
m <- pMon
|
||||
skipDelim
|
||||
y <- decimal :: Parser Integer
|
||||
maybe (fail "bad date format") pure (makeDay y m d)
|
||||
where
|
||||
pMon :: Parser Int
|
||||
pMon = do
|
||||
txt <- Text.toUpper . Text.pack <$> count 3 (satisfy Char.isLetter)
|
||||
case txt of
|
||||
"JAN" -> pure 1
|
||||
"FEB" -> pure 2
|
||||
"MAR" -> pure 3
|
||||
"APR" -> pure 4
|
||||
"MAY" -> pure 5
|
||||
"JUN" -> pure 6
|
||||
"JUL" -> pure 7
|
||||
"AUG" -> pure 8
|
||||
"SEP" -> pure 9
|
||||
"OCT" -> pure 10
|
||||
"NOV" -> pure 11
|
||||
"DEC" -> pure 12
|
||||
_ -> fail "bad month name"
|
||||
|
||||
|
||||
makeYear :: Integer -> Maybe Integer
|
||||
makeYear y' = if y < 1900 && y' < 99
|
||||
then Nothing
|
||||
else pure y
|
||||
where
|
||||
y = if y' < 50
|
||||
then y' + 2000
|
||||
else (if y' >= 50 && y' <= 99
|
||||
then y' + 1900
|
||||
else y' )
|
||||
|
||||
makeDay :: Integer -> Int -> Int -> Maybe Day
|
||||
makeDay y m d | m <= 12 && m > 0 =
|
||||
makeYear y >>= \yyyy -> if d <= gregorianMonthLength yyyy m
|
||||
then pure $ fromGregorian yyyy m d
|
||||
else Nothing
|
||||
|
||||
| otherwise = Nothing
|
|
@ -0,0 +1,46 @@
|
|||
module Data.Text.Fuzzy.Attoparsec.Month ( fuzzyMonth, fuzzyMonthFromText
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>))
|
||||
import Data.Attoparsec.Text (Parser,decimal,digit,letter,many1,parseOnly)
|
||||
import Data.Map (Map)
|
||||
import Data.Maybe
|
||||
import Data.Text (Text)
|
||||
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
|
||||
import qualified Data.Char as Char
|
||||
import qualified Data.Map as Map
|
||||
import qualified Data.Text as Text
|
||||
|
||||
|
||||
fuzzyMonth :: Parser Int
|
||||
fuzzyMonth = pMonthNum <|> pMonth
|
||||
|
||||
fuzzyMonthFromText :: Text -> Maybe Int
|
||||
fuzzyMonthFromText = either (const Nothing) Just . parseOnly fuzzyMonth
|
||||
|
||||
pMonthNum :: Parser Int
|
||||
pMonthNum = do
|
||||
n <- decimal
|
||||
if n >= 1 && n <= 12
then pure n
else fail "invalid month number"
|
||||
|
||||
pMonth :: Parser Int
|
||||
pMonth = do
|
||||
mo <- many1 (Char.toLower <$> letter)
|
||||
maybe (fail "invalid month name") pure (Map.lookup mo months)
|
||||
where
|
||||
months :: Map String Int
|
||||
months = Map.fromList [ ("jan", 1), ("january" , 1)
|
||||
, ("feb", 2), ("february" , 2)
|
||||
, ("mar", 3), ("march" , 3)
|
||||
, ("apr", 4), ("april" , 4)
|
||||
, ("may", 5), ("may" , 5)
|
||||
, ("jun", 6), ("june" , 6)
|
||||
, ("jul", 7), ("july" , 7)
|
||||
, ("aug", 8), ("august" , 8)
|
||||
, ("sep", 9), ("september", 9)
|
||||
, ("oct", 10), ("october" , 10)
|
||||
, ("nov", 11), ("november" , 11)
|
||||
, ("dec", 12), ("december" , 12)
|
||||
]
|
|
@ -0,0 +1,46 @@
|
|||
-- |
|
||||
-- Module : Data.Text.Fuzzy.Dates
|
||||
-- Copyright : Dmitry Zuikov 2020
|
||||
-- License : MIT
|
||||
--
|
||||
-- Maintainer : dzuikov@gmail.com
|
||||
-- Stability : experimental
|
||||
-- Portability : unknown
|
||||
--
|
||||
-- Fuzzy date parsing.
-- Supports a number of date formats and tries to recover
-- incomplete dates from text, using some
-- reasonable assumptions. Does not support locales,
-- i.e. it assumes English month names only for now.
|
||||
--
|
||||
-- == Examples
|
||||
--
|
||||
-- > parseMaybeDay "01.01.1979"
|
||||
-- > Just 1979-01-01
|
||||
-- > parseMaybeDay "01.01.01"
|
||||
-- > Just 2001-01-01
|
||||
-- > parseMaybeDay "13/01/2019"
|
||||
-- > Just 2019-01-13
|
||||
-- > parseMaybeDay "2019-12-1"
|
||||
-- > Just 2019-12-01
|
||||
-- > parseMaybeDay "21-feb-79"
|
||||
-- > Just 1979-02-21
|
||||
-- > parseMaybeDay "21-feb-01"
|
||||
-- > Just 2001-02-21
|
||||
-- > parseMaybeDay "29feb04"
|
||||
-- > Just 2004-02-29
|
||||
-- > parseMaybeDay "21feb28"
|
||||
-- > Just 2028-02-21
|
||||
|
||||
module Data.Text.Fuzzy.Dates where
|
||||
|
||||
import Data.Attoparsec.Text (parseOnly)
|
||||
import Data.Either (either)
|
||||
import Data.Text.Fuzzy.Attoparsec.Day
|
||||
import Data.Text (Text)
|
||||
import Data.Time.Calendar
|
||||
|
||||
-- | Tries to parse a date from the text.
|
||||
parseMaybeDay :: Text -> Maybe Day
|
||||
parseMaybeDay s = either (const Nothing) pure (parseOnly day s)
|
||||
|
|
@ -0,0 +1,378 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE QuasiQuotes #-}
|
||||
{-# LANGUAGE ExtendedDefaultRules #-}
|
||||
{-# LANGUAGE LambdaCase #-}
|
||||
{-# LANGUAGE ImportQualifiedPost #-}
|
||||
{-# LANGUAGE DerivingStrategies #-}
|
||||
{-# LANGUAGE PatternSynonyms #-}
|
||||
{-# LANGUAGE ViewPatterns #-}
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Text.Fuzzy.SExp where
|
||||
|
||||
import Data.Text (Text)
|
||||
|
||||
import Control.Applicative
|
||||
import Control.Monad
|
||||
import Data.Function
|
||||
import Data.Functor
|
||||
import Data.Text.Fuzzy.Tokenize
|
||||
import Control.Monad.Reader
|
||||
import Data.Typeable
|
||||
import Control.Monad.Except
|
||||
import Control.Monad.RWS
|
||||
import Data.Maybe
|
||||
import Data.Char (isSpace,digitToInt)
|
||||
import Data.Generics.Uniplate.Data()
|
||||
import Safe
|
||||
import Data.Data
|
||||
import GHC.Generics
|
||||
import Lens.Micro.Platform
|
||||
import Data.Text qualified as Text
|
||||
import Data.Coerce
|
||||
import Data.Scientific
|
||||
|
||||
import Data.HashMap.Strict (HashMap)
|
||||
import Data.HashMap.Strict qualified as HM
|
||||
|
||||
import Prettyprinter hiding (braces,list)
|
||||
|
||||
|
||||
import Streaming.Prelude qualified as S
|
||||
|
||||
data TTok = TChar Char
|
||||
| TSChar Char
|
||||
| TPunct Char
|
||||
| TText Text
|
||||
| TStrLit Text
|
||||
| TKeyword Text
|
||||
| TEmpty
|
||||
| TIndent Int
|
||||
deriving stock (Eq,Ord,Show,Data,Generic)
|
||||
|
||||
instance IsToken TTok where
|
||||
mkChar = TChar
|
||||
mkSChar = TSChar
|
||||
mkPunct = TPunct
|
||||
mkText = TText
|
||||
mkStrLit = TStrLit
|
||||
mkKeyword = TKeyword
|
||||
mkEmpty = TEmpty
|
||||
mkIndent = TIndent
|
||||
|
||||
newtype C0 = C0 (Maybe Int)
|
||||
deriving stock (Eq,Ord,Show,Data,Typeable,Generic)
|
||||
|
||||
data SExpParseError =
|
||||
ParensOver C0
|
||||
| ParensUnder C0
|
||||
| ParensUnmatched C0
|
||||
| SyntaxError C0
|
||||
deriving stock (Show,Typeable)
|
||||
|
||||
|
||||
data NumType =
|
||||
NumInteger Integer
|
||||
| NumDouble Scientific
|
||||
deriving stock (Eq,Ord,Show,Data,Generic)
|
||||
|
||||
class Monoid c => ForMicroSexp c where
|
||||
|
||||
instance Monoid C0 where
|
||||
mempty = C0 Nothing
|
||||
|
||||
instance Semigroup C0 where
|
||||
(<>) (C0 a) (C0 b) = C0 (b <|> a)
|
||||
|
||||
instance ForMicroSexp C0 where
|
||||
|
||||
|
||||
instance ForMicroSexp () where
|
||||
|
||||
data MicroSexp c =
|
||||
List_ c [MicroSexp c]
|
||||
| Symbol_ c Text
|
||||
| String_ c Text
|
||||
| Number_ c NumType
|
||||
| Boolean_ c Bool
|
||||
deriving stock (Show,Data,Generic)
|
||||
|
||||
pattern List :: ForMicroSexp c => [MicroSexp c] -> MicroSexp c
|
||||
pattern List xs <- List_ _ xs where
|
||||
List xs = List_ mempty xs
|
||||
|
||||
pattern Symbol :: ForMicroSexp c => Text -> MicroSexp c
|
||||
pattern Symbol xs <- Symbol_ _ xs where
|
||||
Symbol xs = Symbol_ mempty xs
|
||||
|
||||
pattern String :: ForMicroSexp c => Text -> MicroSexp c
|
||||
pattern String x <- String_ _ x where
|
||||
String x = String_ mempty x
|
||||
|
||||
pattern Number :: ForMicroSexp c => NumType -> MicroSexp c
|
||||
pattern Number n <- Number_ _ n where
|
||||
Number n = Number_ mempty n
|
||||
|
||||
pattern Boolean :: ForMicroSexp c => Bool -> MicroSexp c
|
||||
pattern Boolean b <- Boolean_ _ b where
|
||||
Boolean b = Boolean_ mempty b
|
||||
|
||||
{-# COMPLETE List, Symbol, String, Number, Boolean #-}
|
||||
|
||||
|
||||
contextOf :: Lens (MicroSexp c) (MicroSexp c) c c
|
||||
contextOf = lens g s
|
||||
where
|
||||
s sexp c = case sexp of
|
||||
List_ _ a -> List_ c a
|
||||
Symbol_ _ a -> Symbol_ c a
|
||||
String_ _ a -> String_ c a
|
||||
Number_ _ a -> Number_ c a
|
||||
Boolean_ _ a -> Boolean_ c a
|
||||
|
||||
g = \case
|
||||
List_ c _ -> c
|
||||
Symbol_ c _ -> c
|
||||
String_ c _ -> c
|
||||
Number_ c _ -> c
|
||||
Boolean_ c _ -> c
|
||||
|
||||
nil :: forall c . ForMicroSexp c => MicroSexp c
|
||||
nil = List []
|
||||
|
||||
symbol :: forall c . ForMicroSexp c => Text -> MicroSexp c
|
||||
symbol = Symbol
|
||||
|
||||
str :: forall c . ForMicroSexp c => Text -> MicroSexp c
|
||||
str = String
|
||||
|
||||
newtype SExpEnv =
|
||||
SExpEnv
|
||||
{ sexpTranslate :: Bool
|
||||
}
|
||||
|
||||
data SExpState =
|
||||
SExpState
|
||||
{ _sexpLno :: Int
|
||||
, _sexpBraces :: [Char]
|
||||
}
|
||||
|
||||
makeLenses 'SExpState
|
||||
|
||||
defEnv :: SExpEnv
|
||||
defEnv = SExpEnv True
|
||||
|
||||
newtype SExpM m a = SExpM { fromSexpM :: RWST SExpEnv () SExpState m a }
|
||||
deriving newtype
|
||||
( Applicative
|
||||
, Functor
|
||||
, Monad
|
||||
, MonadState SExpState
|
||||
, MonadReader SExpEnv
|
||||
, MonadTrans
|
||||
)
|
||||
|
||||
|
||||
instance MonadError SExpParseError m => MonadError SExpParseError (SExpM m) where
|
||||
throwError = lift . throwError
|
||||
catchError w = catchError (coerce $ fromSexpM w)
|
||||
|
||||
tokenizeSexp :: Text -> [TTok]
|
||||
tokenizeSexp txt = do
|
||||
let spec = delims " \r\t" <> comment ";"
|
||||
<> punct "'{}()[]\n"
|
||||
<> sqq
|
||||
<> uw
|
||||
tokenize spec txt
|
||||
|
||||
runSexpM :: Monad m => SExpM m a -> m a
|
||||
runSexpM f = evalRWST (fromSexpM f) defEnv (SExpState 0 []) <&> fst
|
||||
|
||||
|
||||
parseSexp :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m (MicroSexp c)
|
||||
parseSexp txt = do
|
||||
(s, _) <- runSexpM do
|
||||
(s,rest) <- sexp (tokenizeSexp txt)
|
||||
checkBraces
|
||||
pure (s,rest)
|
||||
|
||||
pure s
|
||||
|
||||
checkBraces :: (MonadError SExpParseError m) => SExpM m ()
|
||||
checkBraces = do
|
||||
braces <- gets (view sexpBraces)
|
||||
unless (null braces) $ raiseWith ParensUnder
|
||||
|
||||
succLno :: (MonadError SExpParseError m) => SExpM m ()
|
||||
succLno = modify (over sexpLno succ)
|
||||
|
||||
parseTop :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m [MicroSexp c]
|
||||
parseTop txt = do
|
||||
let tokens = tokenizeSexp txt
|
||||
S.toList_ $ runSexpM do
|
||||
flip fix (mempty,tokens) $ \next -> \case
|
||||
(acc, []) -> do
|
||||
emit acc
|
||||
(acc, TPunct '\n' : rest) -> do
|
||||
succLno
|
||||
emit acc
|
||||
next (mempty,rest)
|
||||
(acc, rest) -> do
|
||||
(s, xs) <- sexp rest
|
||||
next (acc <> [s],xs)
|
||||
|
||||
where
|
||||
|
||||
emit [] = pure ()
|
||||
emit wtf = case wtf of
|
||||
[List one] -> lift $ S.yield (List one)
|
||||
xs -> lift $ S.yield (List xs)
|
||||
|
||||
sexp :: (ForMicroSexp c, MonadError SExpParseError m) => [TTok] -> SExpM m (MicroSexp c, [TTok])
|
||||
sexp s = case s of
|
||||
[] -> do
|
||||
checkBraces
|
||||
pure (nil, mempty)
|
||||
|
||||
(TText l : w) -> (,w) <$> trNum (Symbol l)
|
||||
|
||||
(TStrLit l : w) -> pure (String l, w)
|
||||
|
||||
-- so far ignored
|
||||
(TPunct '\'' : rest) -> sexp rest
|
||||
|
||||
(TPunct '\n' : rest) -> succLno >> sexp rest
|
||||
|
||||
(TPunct c : rest) | isSpace c -> sexp rest
|
||||
|
||||
(TPunct c : rest) | isBrace c ->
|
||||
maybe (pure (nil, rest)) (`list` rest) (closing c)
|
||||
| otherwise -> do
|
||||
raiseWith ParensOver
|
||||
|
||||
( _ : _ ) -> raiseWith SyntaxError
|
||||
|
||||
where
|
||||
|
||||
setContext w = do
|
||||
co <- getC0
|
||||
pure $ over _2 (set contextOf co) w
|
||||
|
||||
isBrace :: Char -> Bool
|
||||
isBrace c = HM.member c braces
|
||||
|
||||
closing :: Char -> Maybe Char
|
||||
closing c = HM.lookup c braces
|
||||
|
||||
braces :: HashMap Char Char
|
||||
braces = HM.fromList[ ('{', '}')
|
||||
, ('(', ')')
|
||||
, ('[', ']')
|
||||
, ('<', '>')
|
||||
]
|
||||
|
||||
cBraces :: [Char]
|
||||
cBraces = HM.elems braces
|
||||
|
||||
trNum tok = do
|
||||
|
||||
trans <- asks sexpTranslate
|
||||
|
||||
case tok of
|
||||
Symbol s | trans -> do
|
||||
let s0 = Text.unpack s
|
||||
|
||||
let what = Number . NumInteger <$> readMay @Integer s0
|
||||
<|>
|
||||
Number . NumInteger <$> parseBinary s0
|
||||
<|>
|
||||
Number . NumDouble <$> readMay @Scientific s0
|
||||
<|>
|
||||
( case s of
|
||||
"#t" -> Just (Boolean True)
|
||||
"#f" -> Just (Boolean False)
|
||||
_ -> Nothing
|
||||
)
|
||||
|
||||
pure $ fromMaybe (Symbol s) what
|
||||
|
||||
|
||||
x -> pure x
|
||||
{-# INLINE trNum #-}
|
||||
|
||||
list :: (ForMicroSexp c, MonadError SExpParseError m)
|
||||
=> Char
|
||||
-> [TTok]
|
||||
-> SExpM m (MicroSexp c, [TTok])
|
||||
|
||||
list _ [] = raiseWith ParensUnder
|
||||
|
||||
list cb tokens = do
|
||||
modify $ over sexpBraces (cb:)
|
||||
|
||||
go cb mempty tokens
|
||||
|
||||
where
|
||||
|
||||
isClosingFor :: Char -> Bool
|
||||
isClosingFor c = c `elem` cBraces
|
||||
|
||||
go _ _ [] = do
|
||||
checkBraces
|
||||
pure (List mempty, mempty)
|
||||
|
||||
go cl acc (TPunct c : rest) | isSpace c = do
|
||||
go cl acc rest
|
||||
|
||||
go cl acc (TPunct c : rest)
|
||||
| isClosingFor c && c == cl = do
|
||||
modify $ over sexpBraces (drop 1)
|
||||
pure (List (reverse acc), rest)
|
||||
|
||||
| isClosingFor c && c /= cl = do
|
||||
raiseWith ParensUnmatched
|
||||
-- throwError =<< ParensUnmatched <$> undefined
|
||||
|
||||
go cl acc rest = do
|
||||
(e,r) <- sexp rest
|
||||
go cl (e : acc) r
|
||||
|
||||
|
||||
getC0 :: Monad m => SExpM m C0
|
||||
getC0 = do
|
||||
lno <- gets (view sexpLno)
|
||||
pure (C0 (Just lno))
|
||||
|
||||
raiseWith :: (MonadError SExpParseError m)
|
||||
=> (C0 -> SExpParseError) -> SExpM m b
|
||||
|
||||
raiseWith a = throwError =<< a <$> getC0
|
||||
|
||||
instance Pretty NumType where
|
||||
pretty = \case
|
||||
NumInteger n -> pretty n
|
||||
NumDouble n -> viaShow n
|
||||
|
||||
instance ForMicroSexp c => Pretty (MicroSexp c) where
|
||||
|
||||
pretty = \case
|
||||
List xs -> parens (hsep (fmap pretty xs))
|
||||
String s -> dquotes (pretty s)
|
||||
Symbol s -> pretty s
|
||||
Number n -> pretty n
|
||||
Boolean True -> pretty "#t"
|
||||
Boolean False -> pretty "#f"
|
||||
|
||||
isBinaryDigit :: Char -> Bool
|
||||
isBinaryDigit c = c == '0' || c == '1'
|
||||
|
||||
parseBinary :: String -> Maybe Integer
|
||||
parseBinary str =
|
||||
let
|
||||
withoutPrefix = case str of
|
||||
'0':'b':rest -> Just rest
|
||||
'0':'B':rest -> Just rest
|
||||
_ -> Nothing
|
||||
in if isJust withoutPrefix && all isBinaryDigit (fromJust withoutPrefix)
|
||||
then Just $ foldl (\acc x -> acc * 2 + toInteger (digitToInt x)) 0 (fromJust withoutPrefix)
|
||||
else Nothing
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
module Data.Text.Fuzzy.Section (cutSectionBy, cutSectionOn) where
|
||||
|
||||
import Data.Text (Text)
|
||||
import qualified Data.List as List
|
||||
|
||||
|
||||
cutSectionOn :: Text -> Text -> [Text] -> [Text]
|
||||
cutSectionOn a b txt = cutSectionBy ((==)a) ((==b)) txt
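-- A usage sketch (illustrative, based on the definitions below): keep the lines
-- starting from the first one matching the opening marker up to, but not
-- including, the first later line matching the closing marker:
--
-- > cutSectionOn "[a]" "[b]" ["x", "[a]", "1", "2", "[b]", "y"]
-- > -- ["[a]","1","2"]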
|
||||
|
||||
cutSectionBy :: (Text -> Bool) -> (Text -> Bool) -> [Text] -> [Text]
|
||||
cutSectionBy a b txt = cutI
|
||||
where
|
||||
cutC = List.dropWhile (not . a) txt
|
||||
cutI = List.takeWhile (not . b) cutC
|
||||
|
|
@ -0,0 +1,556 @@
|
|||
-- |
|
||||
-- Module : Data.Text.Fuzzy.Tokenize
|
||||
-- Copyright : Dmitry Zuikov 2020
|
||||
-- License : MIT
|
||||
--
|
||||
-- Maintainer : dzuikov@gmail.com
|
||||
-- Stability : experimental
|
||||
-- Portability : unknown
|
||||
--
|
||||
-- A lightweight and multi-functional text tokenizer that supports different types of text tokenization
|
||||
-- depending on its settings.
|
||||
--
|
||||
-- It may be used in many situations: for DSLs, text markup, or even for parsing simple grammars,
|
||||
-- often more easily and sometimes faster than with mainstream parser combinators or parser
|
||||
-- generators.
|
||||
--
|
||||
-- The primary goal of this package is to parse unstructured text data, however it may be used for
|
||||
-- parsing such data formats as CSV with ease.
|
||||
--
|
||||
-- Currently it supports the following types of entities: atoms, string literals (currently with the
|
||||
-- minimal set of escape characters), punctuation characters and delimiters.
|
||||
--
|
||||
-- == Examples
|
||||
-- === Simple CSV-like tokenization
|
||||
-- >>> tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
|
||||
-- ["aaa "," bebeb "," qqq "]
|
||||
--
|
||||
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
|
||||
-- ["aaa "," bebeb "," qqq ","","","",""]
|
||||
--
|
||||
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]
|
||||
-- [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
|
||||
--
|
||||
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
|
||||
-- [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
|
||||
--
|
||||
-- >>> let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
|
||||
-- >>> tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
|
||||
-- [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
|
||||
--
|
||||
-- >>> let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
|
||||
-- >>> tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
|
||||
-- [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
|
||||
--
|
||||
-- == Notes
|
||||
--
|
||||
-- === About the delimiter tokens
|
||||
-- Delimiter tokens appear while processing "delimited"
-- formats and disappear from the results. Currently
-- you will never see them unless normalization is turned off with the 'nn' option.
|
||||
--
|
||||
-- Delimiters make sense when processing CSV-like formats,
-- but in that case you probably only need the values in the results.
|
||||
--
|
||||
-- This behavior may change later, but right now delimiters seem pointless
-- in the results. If you process some sort of grammar where the delimiter character
-- is important, you may use punctuation instead, e.g.:
|
||||
--
|
||||
-- >>> let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
|
||||
-- >>> tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
|
||||
-- ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
|
||||
--
|
||||
-- == Other
|
||||
-- For CSV-like formats it makes sense to split the text into lines first;
-- otherwise newline characters may cause weird results.
|
||||
--
|
||||
--
|
||||
|
||||
module Data.Text.Fuzzy.Tokenize ( TokenizeSpec
|
||||
, IsToken(..)
|
||||
, tokenize
|
||||
, esc
|
||||
, addEmptyFields
|
||||
, emptyFields
|
||||
, nn
|
||||
, sq
|
||||
, sqq
|
||||
, noslits
|
||||
, sl
|
||||
, sr
|
||||
, uw
|
||||
, delims
|
||||
, comment
|
||||
, punct
|
||||
, indent
|
||||
, itabstops
|
||||
, keywords
|
||||
, eol
|
||||
) where
|
||||
|
||||
import Prelude hiding (init)
|
||||
|
||||
import Control.Applicative
|
||||
import Data.Map (Map)
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.Monoid()
|
||||
import Data.Set (Set)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.List as List
|
||||
import qualified Data.Map as Map
|
||||
import qualified Data.Set as Set
|
||||
import qualified Data.Text as Text
|
||||
|
||||
import Control.Monad (when)
|
||||
import Control.Monad.RWS
|
||||
|
||||
-- | Tokenization settings. Use mempty for an empty value
|
||||
-- and construction functions for changing the settings.
|
||||
--
|
||||
data TokenizeSpec = TokenizeSpec { tsAtoms :: Set Text
|
||||
, tsStringQQ :: Maybe Bool
|
||||
, tsStringQ :: Maybe Bool
|
||||
, tsNoSlits :: Maybe Bool
|
||||
, tsLineComment :: Map Char Text
|
||||
, tsDelims :: Set Char
|
||||
, tsEol :: Maybe Bool
|
||||
, tsStripLeft :: Maybe Bool
|
||||
, tsStripRight :: Maybe Bool
|
||||
, tsUW :: Maybe Bool
|
||||
, tsNotNormalize :: Maybe Bool
|
||||
, tsEsc :: Maybe Bool
|
||||
, tsAddEmptyFields :: Maybe Bool
|
||||
, tsPunct :: Set Char
|
||||
, tsIndent :: Maybe Bool
|
||||
, tsItabStops :: Maybe Int
|
||||
, tsKeywords :: Set Text
|
||||
}
|
||||
deriving (Eq,Ord,Show)
|
||||
|
||||
|
||||
instance Semigroup TokenizeSpec where
|
||||
(<>) a b = TokenizeSpec { tsAtoms = tsAtoms b <> tsAtoms a
|
||||
, tsStringQQ = tsStringQQ b <|> tsStringQQ a
|
||||
, tsStringQ = tsStringQ b <|> tsStringQ a
|
||||
, tsNoSlits = tsNoSlits b <|> tsNoSlits a
|
||||
, tsLineComment = tsLineComment b <> tsLineComment a
|
||||
, tsDelims = tsDelims b <> tsDelims a
|
||||
, tsEol = tsEol b <|> tsEol a
|
||||
, tsStripLeft = tsStripLeft b <|> tsStripLeft a
|
||||
, tsStripRight = tsStripRight b <|> tsStripRight a
|
||||
, tsUW = tsUW b <|> tsUW a
|
||||
, tsNotNormalize = tsNotNormalize b <|> tsNotNormalize a
|
||||
, tsEsc = tsEsc b <|> tsEsc a
|
||||
, tsAddEmptyFields = tsAddEmptyFields b <|> tsAddEmptyFields a
|
||||
, tsPunct = tsPunct b <> tsPunct a
|
||||
, tsIndent = tsIndent b <|> tsIndent a
|
||||
, tsItabStops = tsItabStops b <|> tsItabStops a
|
||||
, tsKeywords = tsKeywords b <> tsKeywords a
|
||||
}
|
||||
|
||||
instance Monoid TokenizeSpec where
|
||||
mempty = TokenizeSpec { tsAtoms = mempty
|
||||
, tsStringQQ = Nothing
|
||||
, tsStringQ = Nothing
|
||||
, tsNoSlits = Nothing
|
||||
, tsLineComment = mempty
|
||||
, tsDelims = mempty
|
||||
, tsEol = Nothing
|
||||
, tsStripLeft = Nothing
|
||||
, tsStripRight = Nothing
|
||||
, tsUW = Nothing
|
||||
, tsNotNormalize = Nothing
|
||||
, tsEsc = Nothing
|
||||
, tsAddEmptyFields = Nothing
|
||||
, tsPunct = mempty
|
||||
, tsIndent = Nothing
|
||||
, tsItabStops = Nothing
|
||||
, tsKeywords = mempty
|
||||
}
|
||||
|
||||
|
||||
justTrue :: Maybe Bool -> Bool
|
||||
justTrue (Just True) = True
|
||||
justTrue _ = False
|
||||
|
||||
-- | Turns on EOL token generation
|
||||
eol :: TokenizeSpec
|
||||
eol = mempty { tsEol = pure True }
|
||||
|
||||
-- | Turn on character escaping inside string literals.
|
||||
-- Currently the following escaped characters are
|
||||
-- supported: [" ' \ t n r a b f v ]
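--
-- A sketch of the effect (illustrative, not a checked doctest): with 'sq' and
-- 'esc' enabled, the escaped tab inside the single-quoted literal is decoded:
--
-- > tokenize (sq<>esc) "'a\\tb'" :: [Text]
-- > -- ["a\tb"]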
|
||||
esc :: TokenizeSpec
|
||||
esc = mempty { tsEsc = pure True }
|
||||
|
||||
-- | Raise empty field tokens (note mkEmpty method)
|
||||
-- when no tokens are found before a delimiter.
-- Useful for processing CSV-like data in
-- order to distinguish empty columns.
|
||||
addEmptyFields :: TokenizeSpec
|
||||
addEmptyFields = mempty { tsAddEmptyFields = pure True }
|
||||
|
||||
-- | same as addEmptyFields
|
||||
emptyFields :: TokenizeSpec
|
||||
emptyFields = addEmptyFields
|
||||
|
||||
-- | Turns off token normalization. Makes the tokenizer
|
||||
-- generate a character stream. Useful for debugging.
|
||||
nn :: TokenizeSpec
|
||||
nn = mempty { tsNotNormalize = pure True }
|
||||
|
||||
-- | Turns on single-quoted string literals.
|
||||
-- Character stream after '\'' character
|
||||
-- will be processed as a single-quoted stream,
-- treating all delimiter, comment and other special
|
||||
-- characters as a part of the string literal until
|
||||
-- the next unescaped single quote character.
|
||||
sq :: TokenizeSpec
|
||||
sq = mempty { tsStringQ = pure True }
|
||||
|
||||
-- | Enable double-quoted string literals support
|
||||
-- as 'sq' for single-quoted strings.
|
||||
sqq :: TokenizeSpec
|
||||
sqq = mempty { tsStringQQ = pure True }
|
||||
|
||||
-- | Disable separate string literals.
|
||||
--
|
||||
-- Useful when processing delimited data (CSV-like formats).
|
||||
-- Normally, sequential text chunks are concatenated together,
|
||||
-- but a text chunk followed by a string literal will produce two
-- different tokens, and it may cause weird results if the data
-- is in a CSV-like format, e.g.:
|
||||
--
|
||||
-- >>> tokenize (delims ":"<>emptyFields<>sq ) "aaa:bebe:'qq' aaa:next::" :: [Maybe Text]
|
||||
-- [Just "aaa",Just "bebe",Just "qq",Just " aaa",Just "next",Nothing,Nothing]
|
||||
--
|
||||
-- note: "qq" and " aaa" are turned into two separate tokens, which makes the result
-- of CSV processing look wrong, as if it had an extra column. This behavior may be
-- avoided by using this option, if you don't need to distinguish text chunks and string
|
||||
-- literals:
|
||||
--
|
||||
-- >>> tokenize (delims ":"<>emptyFields<>sq<>noslits) "aaa:bebe:'qq:foo' aaa:next::" :: [Maybe Text]
|
||||
-- [Just "aaa",Just "bebe",Just "qq:foo aaa",Just "next",Nothing,Nothing]
|
||||
--
|
||||
noslits :: TokenizeSpec
|
||||
noslits = mempty { tsNoSlits = pure True }
|
||||
|
||||
-- | Specify the list of delimiters (characters)
-- to split the character stream into fields. Useful for CSV-like separated formats. Support for
-- empty fields in the token stream may be enabled with the 'addEmptyFields' function.
|
||||
delims :: String -> TokenizeSpec
|
||||
delims s = mempty { tsDelims = Set.fromList s }
|
||||
|
||||
-- | Strip spaces on left side of a token.
|
||||
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
|
||||
-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces.
|
||||
sl :: TokenizeSpec
|
||||
sl = mempty { tsStripLeft = pure True }
|
||||
|
||||
-- | Strip spaces on right side of a token.
|
||||
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
|
||||
-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces.
|
||||
sr :: TokenizeSpec
|
||||
sr = mempty { tsStripRight = pure True }
|
||||
|
||||
-- | Strips spaces on the right and left sides and collapses multiple spaces into one.
-- The name comes from unwords . words.
--
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
|
||||
-- processing CSV-like formats, otherwise 'delims' may be used to skip unwanted spaces.
|
||||
uw :: TokenizeSpec
|
||||
uw = mempty { tsUW = pure True }
|
||||
|
||||
-- | Specify the line comment prefix.
|
||||
-- All text after the line comment prefix will
|
||||
-- be ignored until the newline character appearance.
|
||||
-- Multiple line comments are supported.
|
||||
comment :: Text -> TokenizeSpec
|
||||
comment s = mempty { tsLineComment = cmt }
|
||||
where
|
||||
cmt = case Text.uncons s of
|
||||
Just (p,su) -> Map.singleton p su
|
||||
Nothing -> mempty
|
||||
|
||||
-- | Specify the punctuation characters.
|
||||
-- Any punctuation character is handled as a separate
|
||||
-- token.
|
||||
-- Any token will be broken on a punctuation character.
|
||||
--
|
||||
-- Useful for handling ... er... punctuation, like
|
||||
--
|
||||
-- >> function(a,b)
|
||||
--
|
||||
-- or
|
||||
--
|
||||
-- >> (apply function 1 2 3)
|
||||
--
|
||||
--
|
||||
-- >>> tokenize spec "(apply function 1 2 3)" :: [Text]
|
||||
-- ["(","apply","function","1","2","3",")"]
|
||||
--
|
||||
punct :: Text -> TokenizeSpec
|
||||
punct s = mempty { tsPunct = Set.fromList (Text.unpack s) }
|
||||
|
||||
-- | Specify the keywords list.
|
||||
-- Each keyword will be treated as a separate token.
|
||||
keywords :: [Text] -> TokenizeSpec
|
||||
keywords s = mempty { tsKeywords = Set.fromList s }
|
||||
|
||||
-- | Enable indentation support
|
||||
indent :: TokenizeSpec
|
||||
indent = mempty { tsIndent = Just True }
|
||||
|
||||
-- | Set the tab expansion multiplier,
|
||||
-- i.e. each tab extends into n spaces before processing.
|
||||
-- It also turns on the indentation. Only the tabs at the beginning of the string are expanded,
|
||||
-- i.e. before the first non-space character appears.
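--
-- A sketch of the output shape (assumed, using a token type whose mkIndent keeps
-- the indentation width, like the TTok type in the package's test suite):
--
-- > tokenize (delims " \n" <> indent) "  foo\nbar" :: [TTok]
-- > -- [TIndent 2, TText "foo", TIndent 0, TText "bar"]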
|
||||
itabstops :: Int -> TokenizeSpec
|
||||
itabstops n = mempty { tsIndent = Just True, tsItabStops = pure n }
|
||||
|
||||
newtype TokenizeM w a = TokenizeM (RWS TokenizeSpec w () a)
|
||||
deriving( Applicative
|
||||
, Functor
|
||||
, MonadReader TokenizeSpec
|
||||
, MonadWriter w
|
||||
, MonadState ()
|
||||
, Monad
|
||||
)
|
||||
|
||||
data Token = TChar Char
|
||||
| TSChar Char
|
||||
| TPunct Char
|
||||
| TText Text
|
||||
| TSLit Text
|
||||
| TKeyword Text
|
||||
| TEmpty
|
||||
| TDelim
|
||||
| TIndent Int
|
||||
| TEol
|
||||
deriving (Eq,Ord,Show)
|
||||
|
||||
-- | Typeclass for token values.
|
||||
-- Note that some tokens appear in the results
-- only when the 'nn' option is set, i.e. sequences
-- of characters are otherwise turned into text tokens or string literals
-- and delimiter tokens are just removed from the
-- results.
|
||||
class IsToken a where
|
||||
-- | Create a character token
|
||||
mkChar :: Char -> a
|
||||
-- | Create a string literal character token
|
||||
mkSChar :: Char -> a
|
||||
-- | Create a punctuation token
|
||||
mkPunct :: Char -> a
|
||||
-- | Create a text chunk token
|
||||
mkText :: Text -> a
|
||||
-- | Create a string literal token
|
||||
mkStrLit :: Text -> a
|
||||
-- | Create a keyword token
|
||||
mkKeyword :: Text -> a
|
||||
-- | Create an empty field token
|
||||
mkEmpty :: a
|
||||
-- | Create a delimiter token
|
||||
mkDelim :: a
|
||||
mkDelim = mkEmpty
|
||||
|
||||
-- | Creates an indent token
|
||||
mkIndent :: Int -> a
|
||||
mkIndent = const mkEmpty
|
||||
|
||||
-- | Creates an EOL token
|
||||
mkEol :: a
|
||||
mkEol = mkEmpty
|
||||
|
||||
instance IsToken (Maybe Text) where
|
||||
mkChar = pure . Text.singleton
|
||||
mkSChar = pure . Text.singleton
|
||||
mkPunct = pure . Text.singleton
|
||||
mkText = pure
|
||||
mkStrLit = pure
|
||||
mkKeyword = pure
|
||||
mkEmpty = Nothing
|
||||
|
||||
instance IsToken Text where
|
||||
mkChar = Text.singleton
|
||||
mkSChar = Text.singleton
|
||||
mkPunct = Text.singleton
|
||||
mkText = id
|
||||
mkStrLit = id
|
||||
mkKeyword = id
|
||||
mkEmpty = ""
|
||||
|
||||
-- | Tokenize a text
|
||||
tokenize :: IsToken a => TokenizeSpec -> Text -> [a]
|
||||
tokenize s t = map tr t1
|
||||
where
|
||||
t1 = tokenize' s t
|
||||
tr (TChar c) = mkChar c
|
||||
tr (TSChar c) = mkSChar c
|
||||
tr (TText c) = mkText c
|
||||
tr (TSLit c) = mkStrLit c
|
||||
tr (TKeyword c) = mkKeyword c
|
||||
tr TEmpty = mkEmpty
|
||||
tr (TPunct c) = mkPunct c
|
||||
tr TDelim = mkDelim
|
||||
tr (TIndent n) = mkIndent n
|
||||
tr TEol = mkEol
|
||||
|
||||
execTokenizeM :: TokenizeM [Token] a -> TokenizeSpec -> [Token]
|
||||
execTokenizeM (TokenizeM m) spec =
|
||||
let (_,w) = execRWS m spec () in norm w
|
||||
|
||||
where norm x | justTrue (tsNotNormalize spec) = x
|
||||
| otherwise = normalize spec x
|
||||
|
||||
tokenize' :: TokenizeSpec -> Text -> [Token]
|
||||
tokenize' spec txt = execTokenizeM (root' txt) spec
|
||||
where
|
||||
|
||||
r = spec
|
||||
|
||||
noIndent = not doIndent
|
||||
doIndent = justTrue (tsIndent r)
|
||||
eolOk = justTrue (tsEol r)
|
||||
|
||||
root' x = scanIndent x >>= root
|
||||
|
||||
root ts = do
|
||||
|
||||
case Text.uncons ts of
|
||||
Nothing -> pure ()
|
||||
|
||||
Just ('\n', rest) | doIndent -> raiseEol >> root' rest
|
||||
Just (c, rest) | Set.member c (tsDelims r) -> tell [TDelim] >> root rest
|
||||
Just ('\'', rest) | justTrue (tsStringQ r) -> scanQ '\'' rest
|
||||
Just ('"', rest) | justTrue (tsStringQQ r) -> scanQ '"' rest
|
||||
|
||||
Just (c, rest) | Map.member c (tsLineComment r) -> scanComment (c,rest)
|
||||
|
||||
Just (c, rest) | Set.member c (tsPunct r) -> tell [TPunct c] >> root rest
|
||||
|
||||
Just (c, rest) | otherwise -> tell [TChar c] >> root rest
|
||||
|
||||
|
||||
raiseEol | eolOk = tell [TEol]
|
||||
| otherwise = pure ()
|
||||
|
||||
expandSpace ' ' = 1
|
||||
expandSpace '\t' = (fromMaybe 8 (tsItabStops r))
|
||||
expandSpace _ = 0
|
||||
|
||||
scanIndent x | noIndent = pure x
|
||||
| otherwise = do
|
||||
let (ss,as) = Text.span (\c -> c == ' ' || c == '\t') x
|
||||
tell [ TIndent (sum (map expandSpace (Text.unpack ss))) ]
|
||||
pure as
|
||||
|
||||
scanComment (c,rest) = do
|
||||
suff <- Map.lookup c <$> asks tsLineComment
|
||||
case suff of
|
||||
Just t | Text.isPrefixOf t rest -> do
|
||||
root $ Text.dropWhile ('\n' /=) rest
|
||||
|
||||
_ -> tell [TChar c] >> root rest
|
||||
|
||||
scanQ q ts = do
|
||||
|
||||
case Text.uncons ts of
|
||||
Nothing -> root ts
|
||||
|
||||
Just ('\\', rest) | justTrue (tsEsc r) -> unesc (scanQ q) rest
|
||||
| otherwise -> tell [tsChar '\\'] >> scanQ q rest
|
||||
|
||||
Just (c, rest) | c == q -> root rest
|
||||
| otherwise -> tell [tsChar c] >> scanQ q rest
|
||||
|
||||
unesc f ts =
|
||||
case Text.uncons ts of
|
||||
Nothing -> f ts
|
||||
Just ('"', rs) -> tell [tsChar '"' ] >> f rs
|
||||
Just ('\'', rs) -> tell [tsChar '\''] >> f rs
|
||||
Just ('\\', rs) -> tell [tsChar '\\'] >> f rs
|
||||
Just ('t', rs) -> tell [tsChar '\t'] >> f rs
|
||||
Just ('n', rs) -> tell [tsChar '\n'] >> f rs
|
||||
Just ('r', rs) -> tell [tsChar '\r'] >> f rs
|
||||
Just ('a', rs) -> tell [tsChar '\a'] >> f rs
|
||||
Just ('b', rs) -> tell [tsChar '\b'] >> f rs
|
||||
Just ('f', rs) -> tell [tsChar '\f'] >> f rs
|
||||
Just ('v', rs) -> tell [tsChar '\v'] >> f rs
|
||||
Just (_, rs) -> f rs
|
||||
|
||||
tsChar c | justTrue (tsNoSlits spec) = TChar c
|
||||
| otherwise = TSChar c
|
||||
|
||||
newtype NormStats = NormStats { nstatBeforeDelim :: Int }
|
||||
|
||||
normalize :: TokenizeSpec -> [Token] -> [Token]
|
||||
normalize spec tokens = snd $ execRWS (go tokens) () init
|
||||
where
|
||||
|
||||
go [] = addEmptyField
|
||||
|
||||
go s@(TIndent _ : _) = do
|
||||
let (iis, rest') = List.span isIndent s
|
||||
tell [TIndent (sum [k | TIndent k <- iis])]
|
||||
go rest'
|
||||
|
||||
go (TChar c0 : cs) = do
|
||||
let (n,ns) = List.span isTChar cs
|
||||
succStat
|
||||
let chunk = eatSpaces $ Text.pack (c0 : [ c | TChar c <- n])
|
||||
let kw = Set.member chunk (tsKeywords spec)
|
||||
tell [ if kw then TKeyword chunk else TText chunk ]
|
||||
go ns
|
||||
|
||||
go (TSChar x : xs) = do
|
||||
let (n,ns) = List.span isTSChar xs
|
||||
succStat
|
||||
tell [ TSLit $ Text.pack (x : [ c | TSChar c <- n]) ]
|
||||
go ns
|
||||
|
||||
go (TDelim : xs) = do
|
||||
addEmptyField
|
||||
pruneStat
|
||||
go xs
|
||||
|
||||
go (TPunct c : xs) = do
|
||||
tell [ TPunct c ]
|
||||
succStat
|
||||
go xs
|
||||
|
||||
go (x:xs) = tell [x] >> go xs
|
||||
|
||||
succStat = do
|
||||
modify (\x -> x { nstatBeforeDelim = succ (nstatBeforeDelim x)})
|
||||
|
||||
pruneStat = do
|
||||
modify (\x -> x { nstatBeforeDelim = 0 } )
|
||||
|
||||
addEmptyField = do
|
||||
ns <- gets nstatBeforeDelim
|
||||
when (ns == 0 && justTrue (tsAddEmptyFields spec) ) $ do
|
||||
tell [ TEmpty ]
|
||||
|
||||
isTChar (TChar _) = True
|
||||
isTChar _ = False
|
||||
|
||||
isTSChar (TSChar _) = True
|
||||
isTSChar _ = False
|
||||
|
||||
isIndent (TIndent _) = True
|
||||
isIndent _ = False
|
||||
|
||||
init = NormStats { nstatBeforeDelim = 0 }
|
||||
|
||||
eatSpaces s | sboth = Text.strip s
|
||||
| sLonly = Text.stripStart s
|
||||
| sRonly = Text.stripEnd s
|
||||
| sWU = (Text.unwords . Text.words) s
|
||||
| otherwise = s
|
||||
|
||||
where sboth = justTrue (tsStripLeft spec) && justTrue (tsStripRight spec)
|
||||
sLonly = justTrue (tsStripLeft spec) && not (justTrue (tsStripRight spec))
|
||||
sRonly = not (justTrue (tsStripLeft spec)) && justTrue (tsStripRight spec)
|
||||
sWU = justTrue (tsUW spec)
|
||||
|
|
@ -0,0 +1 @@
|
|||
module Data.Text.LSH where
|
|
@ -0,0 +1,150 @@
|
|||
{-# LANGUAGE OverloadedStrings
|
||||
, QuasiQuotes
|
||||
, ExtendedDefaultRules
|
||||
, LambdaCase
|
||||
, ImportQualifiedPost
|
||||
, DerivingStrategies
|
||||
, PatternSynonyms
|
||||
, ViewPatterns
|
||||
, MultiWayIf
|
||||
, TemplateHaskell
|
||||
#-}
|
||||
|
||||
module FuzzyParseSpec (spec) where
|
||||
|
||||
import Data.Text (Text)
|
||||
import Test.Hspec
|
||||
import Text.InterpolatedString.Perl6 (q)
|
||||
|
||||
import Data.Text.Fuzzy.Tokenize
|
||||
import Data.Data
|
||||
import Data.Generics.Uniplate.Data()
|
||||
import GHC.Generics
|
||||
|
||||
data TTok = TChar Char
|
||||
| TSChar Char
|
||||
| TPunct Char
|
||||
| TText Text
|
||||
| TStrLit Text
|
||||
| TKeyword Text
|
||||
| TEmpty
|
||||
| TIndent Int
|
||||
deriving stock (Eq,Ord,Show,Data,Generic)
|
||||
|
||||
instance IsToken TTok where
|
||||
mkChar = TChar
|
||||
mkSChar = TSChar
|
||||
mkPunct = TPunct
|
||||
mkText = TText
|
||||
mkStrLit = TStrLit
|
||||
mkKeyword = TKeyword
|
||||
mkEmpty = TEmpty
|
||||
mkIndent = TIndent
|
||||
|
||||
|
||||
|
||||
spec :: Spec
|
||||
spec = do
|
||||
describe "csv-like" $ do
|
||||
it "splits text using ':' delimeter" $ do
|
||||
let toks = tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
|
||||
toks `shouldBe` ["aaa "," bebeb "," qqq "]
|
||||
|
||||
it "splits text using ':' delimeter with single-quotes string and empty fields" $ do
|
||||
let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
|
||||
toks `shouldBe` ["aaa "," bebeb "," qqq ","","","",""]
|
||||
|
||||
it "splits text using ':' delimeter with single-quotes string and empty fields" $ do
|
||||
let toks = tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
|
||||
toks `shouldBe` [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
|
||||
|
||||
it "splits text using ':' delimeter with single-quotes string and empty fields with noslits" $ do
|
||||
let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
|
||||
let toks = tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
|
||||
toks `shouldBe` [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
|
||||
|
||||
it "splits text using ':' delimeter with single-quotes string and empty fields with noslits and uw" $ do
|
||||
let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
|
||||
let toks = tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
|
||||
toks `shouldBe` [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
|
||||
|
||||
it "uses punctuation tokens" $ do
|
||||
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
|
||||
let toks = tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
|
||||
toks `shouldBe` ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
|
||||
|
||||
|
||||
it "tokenize simple lisp-like text with keywords" $ do
|
||||
let spec = delims " \n\t" <> comment ";"
|
||||
<> punct "{}()[]<>"
|
||||
<> sq <> sqq
|
||||
<> uw
|
||||
<> keywords ["define","apply","+"]
|
||||
|
||||
let code = [q|
|
||||
(define add (a b ) ; define simple function
|
||||
(+ a b) )
|
||||
(define r (add 10 20))
|
||||
|]
|
||||
|
||||
let toks = tokenize spec code :: [TTok]
|
||||
|
||||
let expected = [ TPunct '('
|
||||
, TKeyword "define"
|
||||
, TText "add" , TPunct '(', TText "a" , TText "b", TPunct ')'
|
||||
, TPunct '(', TKeyword "+", TText "a",TText "b",TPunct ')',TPunct ')'
|
||||
, TPunct '(',TKeyword "define"
|
||||
,TText "r"
|
||||
,TPunct '(',TText "add",TText "10",TText "20"
|
||||
,TPunct ')',TPunct ')']
|
||||
|
||||
toks `shouldBe` expected
|
||||
|
||||
|
||||
describe "Checks indentation support" $ do
|
||||
|
||||
let spec = delims " \n\t" <> comment ";"
|
||||
<> punct "{}()[]<>"
|
||||
<> sq <> sqq
|
||||
<> uw
|
||||
<> indent
|
||||
<> itabstops 8
|
||||
<> keywords ["define"]
|
||||
|
||||
|
||||
|
||||
it "parses some indented blocks" $ do
|
||||
|
||||
let expected = [ TIndent 0, TKeyword "define", TText "a", TText "0"
|
||||
, TIndent 2, TText "atom", TText "foo", TText "2"
|
||||
, TIndent 2, TKeyword "define", TText "aq", TText "2"
|
||||
, TIndent 4, TText "atom", TText "one", TText "4"
|
||||
, TIndent 4, TText "atom", TText "two", TText "4"
|
||||
, TIndent 0, TKeyword "define", TText "b", TText "0"
|
||||
, TIndent 2, TText "atom", TText "baar", TText "2"
|
||||
, TIndent 2, TText "atom", TText "quux", TText "2"
|
||||
, TIndent 2, TKeyword "define", TText "new", TText "2"
|
||||
, TIndent 6, TText "atom", TText "bar", TText "6"
|
||||
, TIndent 4, TText "atom", TText "fuu", TText "4"
|
||||
, TIndent 0
|
||||
]
|
||||
|
||||
let pyLike = [q|
|
||||
define a 0
|
||||
atom foo 2
|
||||
define aq 2
|
||||
atom one 4
|
||||
atom two 4
|
||||
|
||||
define b 0
|
||||
atom baar 2
|
||||
atom quux 2
|
||||
define new 2
|
||||
atom bar 6
|
||||
atom fuu 4
|
||||
|
||||
|]
|
||||
let toks = tokenize spec pyLike :: [TTok]
|
||||
toks `shouldBe` expected
|
||||
|
||||
|
|
@ -0,0 +1 @@
|
|||
{-# OPTIONS_GHC -F -pgmF hspec-discover #-}
|