mirror of https://github.com/voidlizard/hbs2
Squashed 'miscellaneous/fuzzy-parse/' content from commit a834b152e
git-subtree-dir: miscellaneous/fuzzy-parse
git-subtree-split: a834b152e29d632c816eefe117036e5d9330bd03

commit cf85d2df2a

.envrc
@@ -0,0 +1,5 @@
if [ -f .envrc.local ]; then
  source_env .envrc.local
fi

use flake

.gitignore
@@ -0,0 +1,6 @@
*.swp
dist-newstyle/
Setup.hs

.direnv
.hbs2-git/

CHANGELOG.md
@@ -0,0 +1,10 @@
# Revision history for fuzzy-parse

## 0.1.2.0

- Technical release
- Added some missing things

## 0.1.0.0 -- YYYY-mm-dd

* First version. Released on an unsuspecting world.

LICENSE
@@ -0,0 +1,20 @@
Copyright (c) 2019 Dmitry Zuikov

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

README.markdown
@@ -0,0 +1,131 @@
# About

# Data.Text.Fuzzy.Tokenize

A lightweight and multi-functional text tokenizer that supports different types of text
tokenization depending on its settings.

It may be used in different situations: for DSLs, text markup, or even for parsing simple
grammars more easily, and sometimes faster, than with mainstream parser combinators or parser
generators.

The primary goal of this package is parsing unstructured text data, but it also handles data
formats such as CSV with ease.

Currently it supports the following types of entities: atoms, string literals (currently with a
minimal set of escaped characters), punctuation characters and delimiters.

## Examples

### Simple CSV-like tokenization

```haskell
tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]

["aaa "," bebeb "," qqq "]
```

```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]

["aaa "," bebeb "," qqq ","","","",""]
```

```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]

[Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```

```haskell
tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]

[Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
```

```haskell
let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]

[Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
```

```haskell
let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]

[Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
```

### Primitive lisp-like language

```haskell
{-# LANGUAGE QuasiQuotes, ExtendedDefaultRules #-}

import Text.InterpolatedString.Perl6 (q)
import Data.Text (Text)
import Data.Text.Fuzzy.Tokenize

data TTok = TChar Char
          | TSChar Char
          | TPunct Char
          | TText Text
          | TStrLit Text
          | TKeyword Text
          | TEmpty
          deriving(Eq,Ord,Show)

instance IsToken TTok where
  mkChar    = TChar
  mkSChar   = TSChar
  mkPunct   = TPunct
  mkText    = TText
  mkStrLit  = TStrLit
  mkKeyword = TKeyword
  mkEmpty   = TEmpty

main = do

  let spec = delims " \n\t" <> comment ";"
                            <> punct "{}()[]<>"
                            <> sq <> sqq
                            <> uw
                            <> keywords ["define","apply","+"]
  let code = [q|
    (define add (a b ) ; define simple function
      (+ a b) )
    (define r (add 10 20))
  |]
  let toks = tokenize spec code :: [TTok]

  print toks
```
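
Running this should print the token list: the braces come out as `TPunct`
tokens, `define` and `+` as `TKeyword`, the rest as `TText` chunks, and
everything after `;` up to the end of the line is dropped as a comment.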

## Notes

### About the delimiter tokens

These tokens appear while processing "delimited" formats and disappear from the results.
Currently you will never see them unless normalization is turned off with the 'nn' option.

The delimiters make sense when processing CSV-like formats, but in that case you probably
need only the values in the results.

This behavior may change later, but right now delimiters seem pointless in results. If you
process some sort of grammar where the delimiter character is important, you may use punctuation
instead, e.g.:

```haskell
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]

["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
```

### Other

For CSV-like formats it makes sense to split the text into lines first; otherwise newline
characters may cause weird results.
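
A minimal sketch of that advice (the `parseCsvish` helper name is made up for
illustration):

```haskell
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.Fuzzy.Tokenize

-- Tokenize line by line so newline characters never end up inside fields.
parseCsvish :: Text -> [[Maybe Text]]
parseCsvish = map (tokenize (delims ":" <> emptyFields)) . Text.lines
```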

# Authors

This library is written and maintained by Dmitry Zuikov, dzuikov@gmail.com

@@ -0,0 +1,12 @@
- [ ] TODO: Tests for Data.Text.Fuzzy.Section
- [ ] TODO: haddock for Data.Text.Fuzzy.Section

- [~] TODO: Tests
- [+] TODO: Freeze dependencies versions
- [ ] TODO: Version number
- [+] TODO: Haddocks
- [ ] TODO: Tokenizer: Indentation support
- [ ] TODO: Tokenizer: Block comments support

cabal.project
@@ -0,0 +1,6 @@
packages: *.cabal

allow-newer: all

tests: True

flake.lock
@@ -0,0 +1,61 @@
{
  "nodes": {
    "flake-utils": {
      "locked": {
        "lastModified": 1644229661,
        "narHash": "sha256-1YdnJAsNy69bpcjuoKdOYQX0YxZBiCYZo4Twxerqv7k=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "3cecb5b042f7f209c56ffd8371b2711a290ec797",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "haskell-flake-utils": {
      "inputs": {
        "flake-utils": "flake-utils"
      },
      "locked": {
        "lastModified": 1707809372,
        "narHash": "sha256-wfTL9PlCSOqSSyU4eenFFI7pHrV21gba4GEILnI4nAU=",
        "owner": "ivanovs-4",
        "repo": "haskell-flake-utils",
        "rev": "3cbdc5d6093e8b4464ae64097e0c8c61e4414ff2",
        "type": "github"
      },
      "original": {
        "owner": "ivanovs-4",
        "repo": "haskell-flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1727089097,
        "narHash": "sha256-ZMHMThPsthhUREwDebXw7GX45bJnBCVbfnH1g5iuSPc=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "568bfef547c14ca438c56a0bece08b8bb2b71a9c",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixpkgs-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "haskell-flake-utils": "haskell-flake-utils",
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}

flake.nix
@@ -0,0 +1,69 @@
{
  description = "Haskell cabal package";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    haskell-flake-utils.url = "github:ivanovs-4/haskell-flake-utils";

    # another-simple-haskell-flake.url = "something";

    # some-cabal-pkg.url = "github:example/some-cabal-pkg";
    # some-cabal-pkg.flake = false;
  };

  outputs = { self, nixpkgs, haskell-flake-utils, ... }@inputs:
    haskell-flake-utils.lib.simpleCabal2flake {
      inherit self nixpkgs;
      # systems = [ "x86_64-linux" ];

      name = "fuzzy-parse";

      shellExtBuildInputs = {pkgs}: with pkgs; [
        haskellPackages.haskell-language-server
      ];

      # Whether to build hoogle in the default shell
      shellWithHoogle = true;


      ## Optional parameters follow

      # nixpkgs config
      # config = { };

      # Add other Haskell flakes as requirements
      # haskellFlakes = [ inputs.another-simple-haskell-flake ];

      # Use this to load other flakes overlays to supplement nixpkgs
      # preOverlays = [ ];

      # Pass either a function or a file
      # preOverlay = ./overlay.nix;

      # Override haskell packages
      # hpPreOverrides = { pkgs }: new: old:
      #   with pkgs.haskell.lib; with haskell-flake-utils.lib;
      #   tunePackages pkgs old {
      #     some-haskellPackages-package = [ dontHaddock ];
      #   } // {
      #     some-cabal-pkg = ((jailbreakUnbreak pkgs) (dontCheck (old.callCabal2nix "some-cabal-pkg" inputs.some-cabal-pkg {})));
      #   };

      # Arguments for callCabal2nix
      # cabal2nixArgs = {pkgs}: {
      # };

      # Maps to the devShell output. Pass in a shell.nix file or function
      # shell = ./shell.nix

      # Additional build inputs of the default shell
      # shellExtBuildInputs = {pkgs}: with pkgs; [
      #   haskellPackages.haskell-language-server
      # ];

      # Whether to build hoogle in the default shell
      # shellWithHoogle = true;

    };
}

fuzzy-parse.cabal
@@ -0,0 +1,160 @@
cabal-version: 3.0

name:                fuzzy-parse
version:             0.1.3.1
synopsis:            Tools for processing unstructured text data

description:
    Lightweight and easy-to-use functions for text tokenizing and parsing. It is aimed at
    parsing mostly unstructured data, but structured formats may be parsed as well.

    It may be used in different situations: for DSLs, text markup, or even for parsing simple
    grammars more easily, and sometimes faster, than with mainstream parser combinators
    or parser generators.

    See the README.markdown, examples and modules documentation for more.

license:             MIT
license-file:        LICENSE
author:              Dmitry Zuikov
maintainer:          dzuikov@gmail.com

category:            Text, Parsing
extra-source-files:  CHANGELOG.md

homepage:            https://github.com/hexresearch/fuzzy-parse
bug-reports:         https://github.com/hexresearch/fuzzy-parse/issues

extra-source-files:
    README.markdown

common shared-properties

  default-language: GHC2021

  default-extensions:
                       ApplicativeDo
                     , BangPatterns
                     , BlockArguments
                     , ConstraintKinds
                     , DataKinds
                     , DeriveDataTypeable
                     , DeriveGeneric
                     , DerivingStrategies
                     , DerivingVia
                     , ExtendedDefaultRules
                     , FlexibleContexts
                     , FlexibleInstances
                     , GADTs
                     , GeneralizedNewtypeDeriving
                     , ImportQualifiedPost
                     , LambdaCase
                     , MultiParamTypeClasses
                     , OverloadedStrings
                     , QuasiQuotes
                     , RecordWildCards
                     , ScopedTypeVariables
                     , StandaloneDeriving
                     , TemplateHaskell
                     , TupleSections
                     , TypeApplications
                     , TypeFamilies



library

  import: shared-properties

  ghc-options:
        -Wall
        -fno-warn-type-defaults
        -O2
        "-with-rtsopts=-N4 -A64m -AL256m -I0"


  exposed-modules: Data.Text.Fuzzy.Tokenize
                 , Data.Text.Fuzzy.Dates
                 , Data.Text.Fuzzy.Section
                 , Data.Text.Fuzzy.SExp
                 , Data.Text.Fuzzy.Attoparsec.Day
                 , Data.Text.Fuzzy.Attoparsec.Month

  build-depends: base
               , attoparsec
               , containers
               , mtl
               , prettyprinter
               , safe
               , streaming
               , scientific
               , text
               , time
               , microlens-platform
               , uniplate
               , unliftio
               , unordered-containers
               , timeit

  hs-source-dirs: src


executable fuzzy-sexp-parse

  import: shared-properties
  default-language: GHC2021

  ghc-options:
        -Wall
        -fno-warn-type-defaults
        -O2


  main-is: FuzzySexpParse.hs

  hs-source-dirs: misc

  build-depends: base, fuzzy-parse
               , containers
               , hspec
               , hspec-discover
               , interpolatedstring-perl6
               , text
               , mtl
               , streaming
               , transformers
               , exceptions
               , uniplate
               , microlens-platform
               , safe
               , timeit
               , prettyprinter


test-suite fuzzy-parse-test
  import: shared-properties
  default-language: GHC2021

  type: exitcode-stdio-1.0
  main-is: Spec.hs
  other-modules: FuzzyParseSpec
  hs-source-dirs: test
  build-depends: base, fuzzy-parse
               , containers
               , hspec
               , hspec-discover
               , interpolatedstring-perl6
               , text
               , mtl
               , streaming
               , transformers
               , exceptions
               , uniplate
               , microlens-platform
               , safe
               , timeit

  build-tool-depends: hspec-discover:hspec-discover == 2.*

misc/FuzzySexpParse.hs
@@ -0,0 +1,30 @@
module Main where

import Data.Text.Fuzzy.SExp
import Data.Text.IO qualified as IO
import Data.Text qualified as Text
import Data.Either
import System.TimeIt
import Control.Monad.Except
import Data.Functor
import Data.Function
import Data.Fixed
import Prettyprinter
import System.IO

main :: IO ()
main = do
  s <- IO.getContents

  (tt,toks) <- timeItT do
    pure (tokenizeSexp s)

  (pt,top) <- timeItT do
    runExceptT (parseTop @() s) <&> either (error.show) id

  print (vcat (fmap pretty top))

  hPrint stderr $ pretty (Text.length s) <+> "chars, parsed in" <+> viaShow (realToFrac pt :: Fixed E6)
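
-- To try it (a sketch; the executable name comes from fuzzy-parse.cabal,
-- the input file name is made up):
--
--   cabal run fuzzy-sexp-parse < some-file.sexp
--
-- It prints the parsed top-level forms to stdout and the timing to stderr.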

pkgs.json
@@ -0,0 +1,7 @@
{
  "url": "https://github.com/NixOS/nixpkgs.git",
  "rev": "5cf0de2485efeccc307692eedadbb2d9bfdc7013",
  "date": "2020-06-04T17:30:37+08:00",
  "sha256": "07axrr50nlmnvba5ja2ihzjwczi66znak57bhcz472w22w7m3sd1",
  "fetchSubmodules": false
}

pkgs.nix
@@ -0,0 +1,5 @@
import ((import <nixpkgs> {}).fetchFromGitHub {
  owner = "NixOS";
  repo = "nixpkgs";
  inherit (builtins.fromJSON (builtins.readFile ./pkgs.json)) rev sha256;
})

@@ -0,0 +1,29 @@
let
  pkgs = import ./pkgs.nix { inherit config;
  };
  lib = pkgs.haskell.lib;
  config = {
    packageOverrides = pkgs: rec {
      haskellPackages = pkgs.haskellPackages.override { overrides = haskOverrides; };
    };
  };
  gitignore = pkgs.callPackage (pkgs.fetchFromGitHub {
    owner = "siers";
    repo = "nix-gitignore";
    rev = "ce0778ddd8b1f5f92d26480c21706b51b1af9166";
    sha256 = "1d7ab78i2k13lffskb23x8b5h24x7wkdmpvmria1v3wb9pcpkg2w";
  }) {};
  ignore = gitignore.gitignoreSourceAux ''
    .stack-work
    dist
    dist-newstyle
    .ghc.environment*
  '';
  haskOverrides = new: old:
    let overrides = lib.packagesFromDirectory { directory = ./derivations; } new old;
    in overrides;
in rec {
  inherit pkgs;
  packages = { inherit (pkgs.haskellPackages) fuzzy-parse;
  };
}

@@ -0,0 +1,3 @@
#!/usr/bin/env nix-shell
#! nix-shell -i bash -p nix-prefetch-git
nix-prefetch-git https://github.com/NixOS/nixpkgs.git | tee pkgs.json

@@ -0,0 +1,10 @@
#!/bin/sh
set -e

dir=$(mktemp -d dist-docs.XXXXXX)
trap 'rm -r "$dir"' EXIT

# assumes cabal 2.4 or later
cabal v2-haddock --builddir="$dir" --haddock-for-hackage --enable-doc

cabal upload -d --publish $dir/*-docs.tar.gz

src/Data/Text/Fuzzy/Attoparsec/Day.hs
@@ -0,0 +1,97 @@
module Data.Text.Fuzzy.Attoparsec.Day ( dayDMY
                                      , dayYMD
                                      , dayYYYYMMDD
                                      , dayDMonY
                                      , day
                                      ) where

import Data.List (zipWith)
import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser,decimal,digit,count,satisfy,inClass,skipWhile)
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
import qualified Data.Char as Char
import qualified Data.Text as Text



day :: Parser Day
day = dayDMonY <|> dayYYYYMMDD <|> dayYMD <|> dayDMY

skipDelim :: Parser ()
skipDelim = skipWhile (inClass " ./-")

dayDMY :: Parser Day
dayDMY = do
  d  <- decimal :: Parser Int
  skipDelim
  m  <- decimal :: Parser Int
  skipDelim
  y' <- decimal :: Parser Integer
  maybe (fail "bad date format") pure (makeDay y' m d)

dayYMD :: Parser Day
dayYMD = do
  y' <- decimal :: Parser Integer
  skipDelim
  m  <- decimal :: Parser Int
  skipDelim
  d  <- decimal :: Parser Int
  maybe (fail "bad date format") pure (makeDay y' m d)

dayYYYYMMDD :: Parser Day
dayYYYYMMDD = do
  y <- fromIntegral . num n4 . map o <$> count 4 digit
  m <- num n2 . map o <$> count 2 digit
  d <- num n2 . map o <$> count 2 digit
  maybe (fail "bad date format") pure (makeDay y m d)
  where n4 = [1000,100,10,1]
        n2 = [10,1]
        o x = Char.ord x - Char.ord '0'
        num n x = sum $ zipWith (*) x n

dayDMonY :: Parser Day
dayDMonY = do
  d <- decimal :: Parser Int
  skipDelim
  m <- pMon
  skipDelim
  y <- decimal :: Parser Integer
  maybe (fail "bad date format") pure (makeDay y m d)
  where
    pMon :: Parser Int
    pMon = do
      txt <- Text.toUpper . Text.pack <$> count 3 (satisfy Char.isLetter)
      case txt of
        "JAN" -> pure 1
        "FEB" -> pure 2
        "MAR" -> pure 3
        "APR" -> pure 4
        "MAY" -> pure 5
        "JUN" -> pure 6
        "JUL" -> pure 7
        "AUG" -> pure 8
        "SEP" -> pure 9
        "OCT" -> pure 10
        "NOV" -> pure 11
        "DEC" -> pure 12
        _     -> fail "bad month name"


makeYear :: Integer -> Maybe Integer
makeYear y' = if y < 1900 && y' < 99
                then Nothing
                else pure y
  where
    y = if y' < 50
          then y' + 2000
          else (if y' >= 50 && y' <= 99
                  then y' + 1900
                  else y' )
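
-- A sketch of the two-digit pivot implemented above (an observation based
-- on the definition, not from the docs): 0..49 map to the 2000s, 50..99
-- to the 1900s, and anything else is taken literally.
--
-- >>> makeYear 79
-- Just 1979
-- >>> makeYear 4
-- Just 2004
-- >>> makeYear 1979
-- Just 1979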

makeDay :: Integer -> Int -> Int -> Maybe Day
makeDay y m d
  | m <= 12 && m > 0 =
      makeYear y >>= \yyyy -> if d <= gregorianMonthLength yyyy m
                                then pure $ fromGregorian yyyy m d
                                else Nothing
  | otherwise = Nothing

src/Data/Text/Fuzzy/Attoparsec/Month.hs
@@ -0,0 +1,46 @@
module Data.Text.Fuzzy.Attoparsec.Month ( fuzzyMonth, fuzzyMonthFromText
                                        ) where

import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser,decimal,digit,letter,many1,parseOnly)
import Data.Map (Map)
import Data.Maybe
import Data.Text (Text)
import Data.Time.Calendar (Day,fromGregorian,gregorianMonthLength)
import qualified Data.Char as Char
import qualified Data.Map as Map
import qualified Data.Text as Text


fuzzyMonth :: Parser Int
fuzzyMonth = pMonthNum <|> pMonth

fuzzyMonthFromText :: Text -> Maybe Int
fuzzyMonthFromText = either (const Nothing) Just . parseOnly fuzzyMonth

pMonthNum :: Parser Int
pMonthNum = do
  n <- decimal
  if n >= 1 && n <= 12
    then pure n
    else fail "invalid month number"

pMonth :: Parser Int
pMonth = do
  mo <- many1 (Char.toLower <$> letter)
  maybe (fail "invalid month name") pure (Map.lookup mo months)
  where
    months :: Map String Int
    months = Map.fromList [ ("jan", 1),  ("january"  , 1)
                          , ("feb", 2),  ("february" , 2), ("febuary", 2) -- keep the common misspelling too
                          , ("mar", 3),  ("march"    , 3)
                          , ("apr", 4),  ("april"    , 4)
                          , ("may", 5),  ("may"      , 5)
                          , ("jun", 6),  ("june"     , 6)
                          , ("jul", 7),  ("july"     , 7)
                          , ("aug", 8),  ("august"   , 8)
                          , ("sep", 9),  ("september", 9)
                          , ("oct", 10), ("october"  , 10)
                          , ("nov", 11), ("november" , 11)
                          , ("dec", 12), ("december" , 12)
                          ]
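
-- A couple of usage sketches (assumed, based on the definitions above):
--
-- >>> fuzzyMonthFromText "sep"
-- Just 9
-- >>> fuzzyMonthFromText "September"
-- Just 9
-- >>> fuzzyMonthFromText "9"
-- Just 9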

src/Data/Text/Fuzzy/Dates.hs
@@ -0,0 +1,46 @@
-- |
-- Module      : Data.Text.Fuzzy.Dates
-- Copyright   : Dmitry Zuikov 2020
-- License     : MIT
--
-- Maintainer  : dzuikov@gmail.com
-- Stability   : experimental
-- Portability : unknown
--
-- Fuzzy date parsing.
-- Supports a number of date formats and tries to recover
-- incomplete dates from text, using some
-- reasonable assumptions. Does not support locales,
-- i.e. it assumes English-only month names for now.
--
-- == Examples
--
-- > parseMaybeDay "01.01.1979"
-- > Just 1979-01-01
-- > parseMaybeDay "01.01.01"
-- > Just 2001-01-01
-- > parseMaybeDay "13/01/2019"
-- > Just 2019-01-13
-- > parseMaybeDay "2019-12-1"
-- > Just 2019-12-01
-- > parseMaybeDay "21-feb-79"
-- > Just 1979-02-21
-- > parseMaybeDay "21-feb-01"
-- > Just 2001-02-21
-- > parseMaybeDay "29feb04"
-- > Just 2004-02-29
-- > parseMaybeDay "21feb28"
-- > Just 2028-02-21

module Data.Text.Fuzzy.Dates where

import Data.Attoparsec.Text (parseOnly)
import Data.Either (either)
import Data.Text.Fuzzy.Attoparsec.Day
import Data.Text (Text)
import Data.Time.Calendar

-- | Tries to parse a date from the text.
parseMaybeDay :: Text -> Maybe Day
parseMaybeDay s = either (const Nothing) pure (parseOnly day s)

src/Data/Text/Fuzzy/SExp.hs
@@ -0,0 +1,378 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ExtendedDefaultRules #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ImportQualifiedPost #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE TemplateHaskell #-}
module Data.Text.Fuzzy.SExp where

import Data.Text (Text)

import Control.Applicative
import Control.Monad
import Data.Function
import Data.Functor
import Data.Text.Fuzzy.Tokenize
import Control.Monad.Reader
import Data.Typeable
import Control.Monad.Except
import Control.Monad.RWS
import Data.Maybe
import Data.Char (isSpace,digitToInt)
import Data.Generics.Uniplate.Data()
import Safe
import Data.Data
import GHC.Generics
import Lens.Micro.Platform
import Data.Text qualified as Text
import Data.Coerce
import Data.Scientific

import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HM

import Prettyprinter hiding (braces,list)


import Streaming.Prelude qualified as S

data TTok = TChar Char
          | TSChar Char
          | TPunct Char
          | TText Text
          | TStrLit Text
          | TKeyword Text
          | TEmpty
          | TIndent Int
          deriving stock (Eq,Ord,Show,Data,Generic)

instance IsToken TTok where
  mkChar    = TChar
  mkSChar   = TSChar
  mkPunct   = TPunct
  mkText    = TText
  mkStrLit  = TStrLit
  mkKeyword = TKeyword
  mkEmpty   = TEmpty
  mkIndent  = TIndent

newtype C0 = C0 (Maybe Int)
             deriving stock (Eq,Ord,Show,Data,Typeable,Generic)

data SExpParseError =
    ParensOver C0
  | ParensUnder C0
  | ParensUnmatched C0
  | SyntaxError C0
  deriving stock (Show,Typeable)


data NumType =
    NumInteger Integer
  | NumDouble Scientific
  deriving stock (Eq,Ord,Show,Data,Generic)

class Monoid c => ForMicroSexp c where

instance Monoid C0 where
  mempty = C0 Nothing

instance Semigroup C0 where
  (<>) (C0 a) (C0 b) = C0 (b <|> a)

instance ForMicroSexp C0 where


instance ForMicroSexp () where

data MicroSexp c =
    List_ c [MicroSexp c]
  | Symbol_ c Text
  | String_ c Text
  | Number_ c NumType
  | Boolean_ c Bool
  deriving stock (Show,Data,Generic)

pattern List :: ForMicroSexp c => [MicroSexp c] -> MicroSexp c
pattern List xs <- List_ _ xs where
  List xs = List_ mempty xs

pattern Symbol :: ForMicroSexp c => Text -> MicroSexp c
pattern Symbol xs <- Symbol_ _ xs where
  Symbol xs = Symbol_ mempty xs

pattern String :: ForMicroSexp c => Text -> MicroSexp c
pattern String x <- String_ _ x where
  String x = String_ mempty x

pattern Number :: ForMicroSexp c => NumType -> MicroSexp c
pattern Number n <- Number_ _ n where
  Number n = Number_ mempty n

pattern Boolean :: ForMicroSexp c => Bool -> MicroSexp c
pattern Boolean b <- Boolean_ _ b where
  Boolean b = Boolean_ mempty b

{-# COMPLETE List, Symbol, String, Number, Boolean #-}


contextOf :: Lens (MicroSexp c) (MicroSexp c) c c
contextOf = lens g s
  where
    s sexp c = case sexp of
      List_ _ a    -> List_ c a
      Symbol_ _ a  -> Symbol_ c a
      String_ _ a  -> String_ c a
      Number_ _ a  -> Number_ c a
      Boolean_ _ a -> Boolean_ c a

    g = \case
      List_ c _    -> c
      Symbol_ c _  -> c
      String_ c _  -> c
      Number_ c _  -> c
      Boolean_ c _ -> c

nil :: forall c . ForMicroSexp c => MicroSexp c
nil = List []

symbol :: forall c . ForMicroSexp c => Text -> MicroSexp c
symbol = Symbol

str :: forall c . ForMicroSexp c => Text -> MicroSexp c
str = String

newtype SExpEnv =
  SExpEnv
  { sexpTranslate :: Bool
  }

data SExpState =
  SExpState
  { _sexpLno    :: Int
  , _sexpBraces :: [Char]
  }

makeLenses 'SExpState

defEnv :: SExpEnv
defEnv = SExpEnv True

newtype SExpM m a = SExpM { fromSexpM :: RWST SExpEnv () SExpState m a }
                    deriving newtype
                      ( Applicative
                      , Functor
                      , Monad
                      , MonadState SExpState
                      , MonadReader SExpEnv
                      , MonadTrans
                      )


instance MonadError SExpParseError m => MonadError SExpParseError (SExpM m) where
  throwError = lift . throwError
  catchError w = catchError (coerce $ fromSexpM w)

tokenizeSexp :: Text -> [TTok]
tokenizeSexp txt = do
  let spec = delims " \r\t" <> comment ";"
                            <> punct "'{}()[]\n"
                            <> sqq
                            <> uw
  tokenize spec txt

runSexpM :: Monad m => SExpM m a -> m a
runSexpM f = evalRWST (fromSexpM f) defEnv (SExpState 0 []) <&> fst


parseSexp :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m (MicroSexp c)
parseSexp txt = do
  (s, _) <- runSexpM do
    (s,rest) <- sexp (tokenizeSexp txt)
    checkBraces
    pure (s,rest)

  pure s

checkBraces :: (MonadError SExpParseError m) => SExpM m ()
checkBraces = do
  braces <- gets (view sexpBraces)
  unless (null braces) $ raiseWith ParensUnder

succLno :: (MonadError SExpParseError m) => SExpM m ()
succLno = modify (over sexpLno succ)

parseTop :: (ForMicroSexp c, MonadError SExpParseError m) => Text -> m [MicroSexp c]
parseTop txt = do
  let tokens = tokenizeSexp txt
  S.toList_ $ runSexpM do
    flip fix (mempty,tokens) $ \next -> \case
      (acc, []) -> do
        emit acc
      (acc, TPunct '\n' : rest) -> do
        succLno
        emit acc
        next (mempty,rest)
      (acc, rest) -> do
        (s, xs) <- sexp rest
        next (acc <> [s],xs)

  where

    emit [] = pure ()
    emit wtf = case wtf of
      [List one] -> lift $ S.yield (List one)
      xs         -> lift $ S.yield (List xs)

sexp :: (ForMicroSexp c, MonadError SExpParseError m) => [TTok] -> SExpM m (MicroSexp c, [TTok])
sexp s = case s of
  [] -> do
    checkBraces
    pure (nil, mempty)

  (TText l : w) -> (,w) <$> trNum (Symbol l)

  (TStrLit l : w) -> pure (String l, w)

  -- so far ignored
  (TPunct '\'' : rest) -> sexp rest

  (TPunct '\n' : rest) -> succLno >> sexp rest

  (TPunct c : rest) | isSpace c -> sexp rest

  (TPunct c : rest) | isBrace c ->
                        maybe (pure (nil, rest)) (`list` rest) (closing c)
                    | otherwise -> do
                        raiseWith ParensOver

  ( _ : _ ) -> raiseWith SyntaxError

  where

    setContext w = do
      co <- getC0
      pure $ over _2 (set contextOf co) w

    isBrace :: Char -> Bool
    isBrace c = HM.member c braces

    closing :: Char -> Maybe Char
    closing c = HM.lookup c braces

    braces :: HashMap Char Char
    braces = HM.fromList [ ('{', '}')
                         , ('(', ')')
                         , ('[', ']')
                         , ('<', '>')
                         ]

    cBraces :: [Char]
    cBraces = HM.elems braces

    trNum tok = do

      trans <- asks sexpTranslate

      case tok of
        Symbol s | trans -> do
          let s0 = Text.unpack s

          let what =     Number . NumInteger <$> readMay @Integer s0
                     <|>
                         Number . NumInteger <$> parseBinary s0
                     <|>
                         Number . NumDouble  <$> readMay @Scientific s0
                     <|>
                         ( case s of
                             "#t" -> Just (Boolean True)
                             "#f" -> Just (Boolean False)
                             _    -> Nothing
                         )

          pure $ fromMaybe (Symbol s) what


        x -> pure x
    {-# INLINE trNum #-}

    list :: (ForMicroSexp c, MonadError SExpParseError m)
         => Char
         -> [TTok]
         -> SExpM m (MicroSexp c, [TTok])

    list _ [] = raiseWith ParensUnder

    list cb tokens = do
      modify $ over sexpBraces (cb:)

      go cb mempty tokens

      where

        isClosingFor :: Char -> Bool
        isClosingFor c = c `elem` cBraces

        go _ _ [] = do
          checkBraces
          pure (List mempty, mempty)

        go cl acc (TPunct c : rest) | isSpace c = do
          go cl acc rest

        go cl acc (TPunct c : rest)
          | isClosingFor c && c == cl = do
              modify $ over sexpBraces (drop 1)
              pure (List (reverse acc), rest)

          | isClosingFor c && c /= cl = do
              raiseWith ParensUnmatched
              -- throwError =<< ParensUnmatched <$> undefined

        go cl acc rest = do
          (e,r) <- sexp rest
          go cl (e : acc) r


getC0 :: Monad m => SExpM m C0
getC0 = do
  lno <- gets (view sexpLno)
  pure (C0 (Just lno))

raiseWith :: (MonadError SExpParseError m)
          => (C0 -> SExpParseError) -> SExpM m b

raiseWith a = throwError =<< a <$> getC0

instance Pretty NumType where
  pretty = \case
    NumInteger n -> pretty n
    NumDouble n  -> viaShow n

instance ForMicroSexp c => Pretty (MicroSexp c) where

  pretty = \case
    List xs       -> parens (hsep (fmap pretty xs))
    String s      -> dquotes (pretty s)
    Symbol s      -> pretty s
    Number n      -> pretty n
    Boolean True  -> pretty "#t"
    Boolean False -> pretty "#f"

isBinaryDigit :: Char -> Bool
isBinaryDigit c = c == '0' || c == '1'

parseBinary :: String -> Maybe Integer
parseBinary str =
  let
    withoutPrefix = case str of
      '0':'b':rest -> Just rest
      '0':'B':rest -> Just rest
      _            -> Nothing
  in if isJust withoutPrefix && all isBinaryDigit (fromJust withoutPrefix)
       then Just $ foldl (\acc x -> acc * 2 + toInteger (digitToInt x)) 0 (fromJust withoutPrefix)
       else Nothing
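
-- A usage sketch (assumed, based on the definition above): only strings
-- with an explicit 0b/0B prefix parse as binary numbers.
--
-- >>> parseBinary "0b101"
-- Just 5
-- >>> parseBinary "101"
-- Nothing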

src/Data/Text/Fuzzy/Section.hs
@@ -0,0 +1,15 @@
module Data.Text.Fuzzy.Section (cutSectionBy, cutSectionOn) where

import Data.Text (Text)
import qualified Data.List as List


cutSectionOn :: Text -> Text -> [Text] -> [Text]
cutSectionOn a b txt = cutSectionBy (== a) (== b) txt

cutSectionBy :: (Text -> Bool) -> (Text -> Bool) -> [Text] -> [Text]
cutSectionBy a b txt = cutI
  where
    cutC = List.dropWhile (not . a) txt
    cutI = List.takeWhile (not . b) cutC
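
-- A usage sketch (an assumption based on the definitions above, not taken
-- from package docs): the section starts at the first line matching the
-- first predicate (inclusive) and stops before the first subsequent line
-- matching the second one.
--
-- >>> cutSectionOn "# A" "# B" ["x", "# A", "y", "# B", "z"]
-- ["# A","y"]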

src/Data/Text/Fuzzy/Tokenize.hs
@@ -0,0 +1,556 @@
-- |
-- Module      : Data.Text.Fuzzy.Tokenize
-- Copyright   : Dmitry Zuikov 2020
-- License     : MIT
--
-- Maintainer  : dzuikov@gmail.com
-- Stability   : experimental
-- Portability : unknown
--
-- A lightweight and multi-functional text tokenizer that supports different types of text
-- tokenization depending on its settings.
--
-- It may be used in different situations: for DSLs, text markup or even for parsing simple
-- grammars more easily, and sometimes faster, than with mainstream parser combinators or
-- parser generators.
--
-- The primary goal of this package is parsing unstructured text data, but it also handles
-- data formats such as CSV with ease.
--
-- Currently it supports the following types of entities: atoms, string literals (currently with a
-- minimal set of escaped characters), punctuation characters and delimiters.
--
-- == Examples
-- === Simple CSV-like tokenization
-- >>> tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq "]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Text]
-- ["aaa "," bebeb "," qqq ","","","",""]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : bebeb : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " bebeb ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> tokenize (delims ":"<>sq<>emptyFields ) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
-- >>> tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]
--
-- >>> let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
-- >>> tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
-- [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
--
-- == Notes
--
-- === About the delimiter tokens
-- These tokens appear while processing "delimited"
-- formats and disappear from the results. Currently
-- you will never see them unless normalization is turned off with the 'nn' option.
--
-- The delimiters make sense when processing CSV-like formats,
-- but in that case you probably need only the values in the results.
--
-- This behavior may change later, but right now delimiters seem pointless
-- in results. If you process some sort of grammar where the delimiter character
-- is important, you may use punctuation instead, e.g.:
--
-- >>> let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
-- >>> tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
-- ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
--
-- == Other
-- For CSV-like formats it makes sense to split the text into lines first,
-- otherwise newline characters may cause weird results
--
--

module Data.Text.Fuzzy.Tokenize ( TokenizeSpec
                                , IsToken(..)
                                , tokenize
                                , esc
                                , addEmptyFields
                                , emptyFields
                                , nn
                                , sq
                                , sqq
                                , noslits
                                , sl
                                , sr
                                , uw
                                , delims
                                , comment
                                , punct
                                , indent
                                , itabstops
                                , keywords
                                , eol
                                ) where

import Prelude hiding (init)

import Control.Applicative
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import Data.Monoid()
import Data.Set (Set)
import Data.Text (Text)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text

import Control.Monad (when)
import Control.Monad.RWS

-- | Tokenization settings. Use mempty for an empty value
-- and construction functions for changing the settings.
--
data TokenizeSpec = TokenizeSpec { tsAtoms          :: Set Text
                                 , tsStringQQ       :: Maybe Bool
                                 , tsStringQ        :: Maybe Bool
                                 , tsNoSlits        :: Maybe Bool
                                 , tsLineComment    :: Map Char Text
                                 , tsDelims         :: Set Char
                                 , tsEol            :: Maybe Bool
                                 , tsStripLeft      :: Maybe Bool
                                 , tsStripRight     :: Maybe Bool
                                 , tsUW             :: Maybe Bool
                                 , tsNotNormalize   :: Maybe Bool
                                 , tsEsc            :: Maybe Bool
                                 , tsAddEmptyFields :: Maybe Bool
                                 , tsPunct          :: Set Char
                                 , tsIndent         :: Maybe Bool
                                 , tsItabStops      :: Maybe Int
                                 , tsKeywords       :: Set Text
                                 }
                    deriving (Eq,Ord,Show)


instance Semigroup TokenizeSpec where
  (<>) a b = TokenizeSpec { tsAtoms          = tsAtoms b <> tsAtoms a
                          , tsStringQQ       = tsStringQQ b <|> tsStringQQ a
                          , tsStringQ        = tsStringQ b <|> tsStringQ a
                          , tsNoSlits        = tsNoSlits b <|> tsNoSlits a
                          , tsLineComment    = tsLineComment b <> tsLineComment a
                          , tsDelims         = tsDelims b <> tsDelims a
                          , tsEol            = tsEol b <|> tsEol a
                          , tsStripLeft      = tsStripLeft b <|> tsStripLeft a
                          , tsStripRight     = tsStripRight b <|> tsStripRight a
                          , tsUW             = tsUW b <|> tsUW a
                          , tsNotNormalize   = tsNotNormalize b <|> tsNotNormalize a
                          , tsEsc            = tsEsc b <|> tsEsc a
                          , tsAddEmptyFields = tsAddEmptyFields b <|> tsAddEmptyFields a
                          , tsPunct          = tsPunct b <> tsPunct a
                          , tsIndent         = tsIndent b <|> tsIndent a
                          , tsItabStops      = tsItabStops b <|> tsItabStops a
                          , tsKeywords       = tsKeywords b <> tsKeywords a
                          }
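
-- A note on the instance above (an observation, not from the docs): the
-- merge is right-biased for the Maybe-valued switches and takes the union
-- of the set-valued fields, so in delims ":" <> delims ";" both ':' and
-- ';' become delimiters, while for flags the rightmost setting wins.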

instance Monoid TokenizeSpec where
  mempty = TokenizeSpec { tsAtoms          = mempty
                        , tsStringQQ       = Nothing
                        , tsStringQ        = Nothing
                        , tsNoSlits        = Nothing
                        , tsLineComment    = mempty
                        , tsDelims         = mempty
                        , tsEol            = Nothing
                        , tsStripLeft      = Nothing
                        , tsStripRight     = Nothing
                        , tsUW             = Nothing
                        , tsNotNormalize   = Nothing
                        , tsEsc            = Nothing
                        , tsAddEmptyFields = Nothing
                        , tsPunct          = mempty
                        , tsIndent         = Nothing
                        , tsItabStops      = Nothing
                        , tsKeywords       = mempty
                        }


justTrue :: Maybe Bool -> Bool
justTrue (Just True) = True
justTrue _           = False

-- | Turns on EOL token generation
eol :: TokenizeSpec
eol = mempty { tsEol = pure True }

-- | Turn on character escaping inside string literals.
-- Currently the following escaped characters are
-- supported: [" ' \ t n r a b f v ]
esc :: TokenizeSpec
esc = mempty { tsEsc = pure True }

-- | Raise empty field tokens (note the mkEmpty method)
-- when no tokens are found before a delimiter.
-- Useful for processing CSV-like data in
-- order to distinguish empty columns.
addEmptyFields :: TokenizeSpec
addEmptyFields = mempty { tsAddEmptyFields = pure True }

-- | Same as 'addEmptyFields'
emptyFields :: TokenizeSpec
emptyFields = addEmptyFields

-- | Turns off token normalization. Makes the tokenizer
-- generate a raw character stream. Useful for debugging.
nn :: TokenizeSpec
nn = mempty { tsNotNormalize = pure True }

-- | Turns on single-quoted string literals.
-- The character stream after a '\'' character
-- will be processed as a single-quoted stream,
-- treating all delimiter, comment and other special
-- characters as part of the string literal until
-- the next unescaped single quote character.
sq :: TokenizeSpec
sq = mempty { tsStringQ = pure True }

-- | Enable double-quoted string literal support,
-- as 'sq' does for single-quoted strings.
sqq :: TokenizeSpec
sqq = mempty { tsStringQQ = pure True }

-- | Disable separate string literals.
--
-- Useful when processing delimited data (CSV-like formats).
-- Normally, sequential text chunks are concatenated together,
-- but a text chunk followed by a string literal will produce two
-- different tokens, and this may cause weird results if the data
-- is in a CSV-like format, e.g.:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq ) "aaa:bebe:'qq' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq",Just " aaa",Just "next",Nothing,Nothing]
--
-- look: "qq" and " aaa" are turned into two separate tokens, which makes the result
-- of CSV processing look improper, as if it had an extra column. This behavior may be
-- avoided using this option, if you don't need to distinguish text chunks and string
-- literals:
--
-- >>> tokenize (delims ":"<>emptyFields<>sq<>noslits) "aaa:bebe:'qq:foo' aaa:next::" :: [Maybe Text]
-- [Just "aaa",Just "bebe",Just "qq:foo aaa",Just "next",Nothing,Nothing]
--
noslits :: TokenizeSpec
noslits = mempty { tsNoSlits = pure True }

-- | Specify the list of delimiters (characters)
-- used to split the character stream into fields. Useful for CSV-like separated formats. Support
-- for empty fields in the token stream may be enabled with the 'addEmptyFields' function.
delims :: String -> TokenizeSpec
delims s = mempty { tsDelims = Set.fromList s }

-- | Strip spaces on the left side of a token.
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats; otherwise 'delims' may be used to skip unwanted spaces.
sl :: TokenizeSpec
sl = mempty { tsStripLeft = pure True }

-- | Strip spaces on the right side of a token.
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats; otherwise 'delims' may be used to skip unwanted spaces.
sr :: TokenizeSpec
sr = mempty { tsStripRight = pure True }

-- | Strips spaces on both sides and collapses runs of spaces into one.
-- The name comes from @unwords . words@.
--
-- Does not affect string literals, i.e. strings are processed normally. Useful mostly for
-- processing CSV-like formats; otherwise 'delims' may be used to skip unwanted spaces.
uw :: TokenizeSpec
uw = mempty { tsUW = pure True }

-- | Specify the line comment prefix.
-- All text after the line comment prefix is
-- ignored until a newline character appears.
-- Multiple line comments are supported.
comment :: Text -> TokenizeSpec
comment s = mempty { tsLineComment = cmt }
  where
    cmt = case Text.uncons s of
            Just (p,su) -> Map.singleton p su
            Nothing     -> mempty
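
-- A usage sketch (assumed, based on the definition above): everything
-- from the comment prefix to the end of the line is dropped.
--
-- >>> tokenize (delims " " <> comment ";") "a b ; the rest is ignored" :: [Text]
-- ["a","b"]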
|
||||||
|
|
||||||
|
-- | Specify the punctuation characters.
|
||||||
|
-- Any punctuation character is handled as a separate
|
||||||
|
-- token.
|
||||||
|
-- Any token will be breaked on a punctiation character.
|
||||||
|
--
|
||||||
|
-- Useful for handling ... er... punctuaton, like
|
||||||
|
--
|
||||||
|
-- >> function(a,b)
|
||||||
|
--
|
||||||
|
-- or
|
||||||
|
--
|
||||||
|
-- >> (apply function 1 2 3)
|
||||||
|
--
|
||||||
|
--
|
||||||
|
-- >>> tokenize spec "(apply function 1 2 3)" :: [Text]
|
||||||
|
-- ["(","apply","function","1","2","3",")"]
|
||||||
|
--
|
||||||
|
punct :: Text -> TokenizeSpec
|
||||||
|
punct s = mempty { tsPunct = Set.fromList (Text.unpack s) }
|
||||||
|
|
||||||
|
-- | Specify the keywords list.
|
||||||
|
-- Each keyword will be threated as a separate token.
|
||||||
|
keywords :: [Text] -> TokenizeSpec
|
||||||
|
keywords s = mempty { tsKeywords = Set.fromList s }
|
||||||
|
|
||||||
|
-- | Enable identation support
|
||||||
|
indent :: TokenizeSpec
|
||||||
|
indent = mempty { tsIndent = Just True }
|
||||||
|
|
||||||
|
-- | Set tab expanding multiplier
|
||||||
|
-- i.e. each tab extends into n spaces before processing.
|
||||||
|
-- It also turns on the indentation. Only the tabs at the beginning of the string are expanded,
|
||||||
|
-- i.e. before the first non-space character appears.
|
||||||
|
itabstops :: Int -> TokenizeSpec
|
||||||
|
itabstops n = mempty { tsIndent = Just True, tsItabStops = pure n }
|
||||||
|
|
||||||
|
newtype TokenizeM w a = TokenizeM (RWS TokenizeSpec w () a)
|
||||||
|
deriving( Applicative
|
||||||
|
, Functor
|
||||||
|
, MonadReader TokenizeSpec
|
||||||
|
, MonadWriter w
|
||||||
|
, MonadState ()
|
||||||
|
, Monad
|
||||||
|
)
|
||||||
|
|
||||||
|
data Token = TChar Char
|
||||||
|
| TSChar Char
|
||||||
|
| TPunct Char
|
||||||
|
| TText Text
|
||||||
|
| TSLit Text
|
||||||
|
| TKeyword Text
|
||||||
|
| TEmpty
|
||||||
|
| TDelim
|
||||||
|
| TIndent Int
|
||||||
|
| TEol
|
||||||
|
deriving (Eq,Ord,Show)
|
||||||
|
|
||||||
|
-- | Typeclass for token values.
|
||||||
|
-- Note, that some tokens appear in results
|
||||||
|
-- only when 'nn' option is set, i.e. sequences
|
||||||
|
-- of characters turn out to text tokens or string literals
|
||||||
|
-- and delimeter tokens are just removed from the
|
||||||
|
-- results
|
||||||
|
class IsToken a where
|
||||||
|
-- | Create a character token
|
||||||
|
mkChar :: Char -> a
|
||||||
|
-- | Create a string literal character token
|
||||||
|
mkSChar :: Char -> a
|
||||||
|
-- | Create a punctuation token
|
||||||
|
mkPunct :: Char -> a
|
||||||
|
-- | Create a text chunk token
|
||||||
|
mkText :: Text -> a
|
||||||
|
-- | Create a string literal token
|
||||||
|
mkStrLit :: Text -> a
|
||||||
|
-- | Create a keyword token
|
||||||
|
mkKeyword :: Text -> a
|
||||||
|
-- | Create an empty field token
|
||||||
|
mkEmpty :: a
|
||||||
|
-- | Create a delimeter token
|
||||||
|
mkDelim :: a
|
||||||
|
mkDelim = mkEmpty
|
||||||
|
|
||||||
|
-- | Creates an indent token
|
||||||
|
mkIndent :: Int -> a
|
||||||
|
mkIndent = const mkEmpty
|
||||||
|
|
||||||
|
-- | Creates an EOL token
|
||||||
|
mkEol :: a
|
||||||
|
mkEol = mkEmpty
|
||||||
|
|
||||||
|
instance IsToken (Maybe Text) where
|
||||||
|
mkChar = pure . Text.singleton
|
||||||
|
mkSChar = pure . Text.singleton
|
||||||
|
mkPunct = pure . Text.singleton
|
||||||
|
mkText = pure
|
||||||
|
mkStrLit = pure
|
||||||
|
mkKeyword = pure
|
||||||
|
mkEmpty = Nothing
|
||||||
|
|
||||||
|
instance IsToken Text where
|
||||||
|
mkChar = Text.singleton
|
||||||
|
mkSChar = Text.singleton
|
||||||
|
mkPunct = Text.singleton
|
||||||
|
mkText = id
|
||||||
|
mkStrLit = id
|
||||||
|
mkKeyword = id
|
||||||
|
mkEmpty = ""
|
||||||
|
|
||||||
|
-- | Tokenize a text
|
||||||
|
tokenize :: IsToken a => TokenizeSpec -> Text -> [a]
|
||||||
|
tokenize s t = map tr t1
|
||||||
|
where
|
||||||
|
t1 = tokenize' s t
|
||||||
|
tr (TChar c) = mkChar c
|
||||||
|
tr (TSChar c) = mkSChar c
|
||||||
|
tr (TText c) = mkText c
|
||||||
|
tr (TSLit c) = mkStrLit c
|
||||||
|
tr (TKeyword c) = mkKeyword c
|
||||||
|
tr TEmpty = mkEmpty
|
||||||
|
tr (TPunct c) = mkPunct c
|
||||||
|
tr TDelim = mkDelim
|
||||||
|
tr (TIndent n) = mkIndent n
|
||||||
|
tr TEol = mkEol
|
||||||
|
|
||||||
|
execTokenizeM :: TokenizeM [Token] a -> TokenizeSpec -> [Token]
|
||||||
|
execTokenizeM (TokenizeM m) spec =
|
||||||
|
let (_,w) = execRWS m spec () in norm w
|
||||||
|
|
||||||
|
where norm x | justTrue (tsNotNormalize spec) = x
|
||||||
|
| otherwise = normalize spec x
|
||||||
|
|
||||||
|
tokenize' :: TokenizeSpec -> Text -> [Token]
tokenize' spec txt = execTokenizeM (root' txt) spec
  where

    r = spec

    noIndent = not doIndent
    doIndent = justTrue (tsIndent r)
    eolOk    = justTrue (tsEol r)

    root' x = scanIndent x >>= root

    root ts = do
      case Text.uncons ts of
        Nothing -> pure ()

        Just ('\n', rest) | doIndent -> raiseEol >> root' rest
        Just (c, rest)    | Set.member c (tsDelims r) -> tell [TDelim] >> root rest
        Just ('\'', rest) | justTrue (tsStringQ r)  -> scanQ '\'' rest
        Just ('"', rest)  | justTrue (tsStringQQ r) -> scanQ '"' rest

        Just (c, rest) | Map.member c (tsLineComment r) -> scanComment (c,rest)

        Just (c, rest) | Set.member c (tsPunct r) -> tell [TPunct c] >> root rest

        Just (c, rest) | otherwise -> tell [TChar c] >> root rest

    raiseEol | eolOk     = tell [TEol]
             | otherwise = pure ()

    expandSpace ' '  = 1
    expandSpace '\t' = fromMaybe 8 (tsItabStops r)
    expandSpace _    = 0

    scanIndent x | noIndent  = pure x
                 | otherwise = do
        let (ss,as) = Text.span (\c -> c == ' ' || c == '\t') x
        tell [ TIndent (sum (map expandSpace (Text.unpack ss))) ]
        pure as
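    -- Worked example of the indent arithmetic above: with itabstops 8,
    -- a leading "\t  " expands to 8 + 1 + 1 = 10 columns and is
    -- emitted as TIndent 10.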
    scanComment (c,rest) = do
      suff <- Map.lookup c <$> asks tsLineComment
      case suff of
        Just t | Text.isPrefixOf t rest ->
          root $ Text.dropWhile ('\n' /=) rest

        _ -> tell [TChar c] >> root rest

    scanQ q ts = do
      case Text.uncons ts of
        Nothing -> root ts

        Just ('\\', rest) | justTrue (tsEsc r) -> unesc (scanQ q) rest
                          | otherwise          -> tell [tsChar '\\'] >> scanQ q rest

        Just (c, rest) | c == q    -> root rest
                       | otherwise -> tell [tsChar c] >> scanQ q rest

    unesc f ts =
      case Text.uncons ts of
        Nothing         -> f ts
        Just ('"', rs)  -> tell [tsChar '"' ] >> f rs
        Just ('\'', rs) -> tell [tsChar '\''] >> f rs
        Just ('\\', rs) -> tell [tsChar '\\'] >> f rs
        Just ('t', rs)  -> tell [tsChar '\t'] >> f rs
        Just ('n', rs)  -> tell [tsChar '\n'] >> f rs
        Just ('r', rs)  -> tell [tsChar '\r'] >> f rs
        Just ('a', rs)  -> tell [tsChar '\a'] >> f rs
        Just ('b', rs)  -> tell [tsChar '\b'] >> f rs
        Just ('f', rs)  -> tell [tsChar '\f'] >> f rs
        Just ('v', rs)  -> tell [tsChar '\v'] >> f rs
        Just (_, rs)    -> f rs

    tsChar c | justTrue (tsNoSlits spec) = TChar c
             | otherwise                 = TSChar c
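-- A sketch of the escape handling above: when tsEsc is enabled, a
-- single-quoted input like 'a\nb' goes through 'unesc', so the
-- resulting TSLit contains a real newline; an unrecognized escape
-- such as '\z' is dropped entirely (the final 'Just (_, rs)' case).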
newtype NormStats = NormStats { nstatBeforeDelim :: Int }

normalize :: TokenizeSpec -> [Token] -> [Token]
normalize spec tokens = snd $ execRWS (go tokens) () init
  where

    go [] = addEmptyField

    go s@(TIndent _ : _) = do
      let (iis, rest') = List.span isIndent s
      tell [TIndent (sum [k | TIndent k <- iis])]
      go rest'

    go (TChar c0 : cs) = do
      let (n,ns) = List.span isTChar cs
      succStat
      let chunk = eatSpaces $ Text.pack (c0 : [ c | TChar c <- n])
      let kw = Set.member chunk (tsKeywords spec)
      tell [ if kw then TKeyword chunk else TText chunk ]
      go ns

    go (TSChar x : xs) = do
      let (n,ns) = List.span isTSChar xs
      succStat
      tell [ TSLit $ Text.pack (x : [ c | TSChar c <- n]) ]
      go ns

    go (TDelim : xs) = do
      addEmptyField
      pruneStat
      go xs

    go (TPunct c : xs) = do
      tell [ TPunct c ]
      succStat
      go xs

    go (x:xs) = tell [x] >> go xs

    succStat =
      modify (\x -> x { nstatBeforeDelim = succ (nstatBeforeDelim x)})

    pruneStat =
      modify (\x -> x { nstatBeforeDelim = 0 } )

    addEmptyField = do
      ns <- gets nstatBeforeDelim
      when (ns == 0 && justTrue (tsAddEmptyFields spec) ) $ do
        tell [ TEmpty ]

    isTChar (TChar _) = True
    isTChar _         = False

    isTSChar (TSChar _) = True
    isTSChar _          = False

    isIndent (TIndent _) = True
    isIndent _           = False

    init = NormStats { nstatBeforeDelim = 0 }

    eatSpaces s | sboth     = Text.strip s
                | sLonly    = Text.stripStart s
                | sRonly    = Text.stripEnd s
                | sWU       = (Text.unwords . Text.words) s
                | otherwise = s

      where sboth  = justTrue (tsStripLeft spec) && justTrue (tsStripRight spec)
            sLonly = justTrue (tsStripLeft spec) && not (justTrue (tsStripRight spec))
            sRonly = not (justTrue (tsStripLeft spec)) && justTrue (tsStripRight spec)
            sWU    = justTrue (tsUW spec)
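-- A sketch of the stripping options handled by eatSpaces (GHCi;
-- assuming 'sl' and 'sr' are the combinators setting tsStripLeft and
-- tsStripRight):
--
-- >>> tokenize (delims ":" <> sl <> sr) " a : b " :: [Text]
-- ["a","b"]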
@ -0,0 +1 @@
module Data.Text.LSH where
@ -0,0 +1,150 @@
{-# LANGUAGE OverloadedStrings
           , QuasiQuotes
           , ExtendedDefaultRules
           , LambdaCase
           , ImportQualifiedPost
           , DerivingStrategies
           , PatternSynonyms
           , ViewPatterns
           , MultiWayIf
           , TemplateHaskell
#-}

module FuzzyParseSpec (spec) where

import Data.Text (Text)
import Test.Hspec
import Text.InterpolatedString.Perl6 (q)

import Data.Text.Fuzzy.Tokenize
import Data.Data
import Data.Generics.Uniplate.Data()
import GHC.Generics
data TTok = TChar Char
          | TSChar Char
          | TPunct Char
          | TText Text
          | TStrLit Text
          | TKeyword Text
          | TEmpty
          | TIndent Int
          deriving stock (Eq,Ord,Show,Data,Generic)

instance IsToken TTok where
  mkChar = TChar
  mkSChar = TSChar
  mkPunct = TPunct
  mkText = TText
  mkStrLit = TStrLit
  mkKeyword = TKeyword
  mkEmpty = TEmpty
  mkIndent = TIndent
spec :: Spec
spec = do
  describe "csv-like" $ do
    it "splits text using the ':' delimiter" $ do
      let toks = tokenize (delims ":") "aaa : bebeb : qqq ::::" :: [Text]
      toks `shouldBe` ["aaa "," bebeb "," qqq "]

    it "splits text using the ':' delimiter with single-quoted strings and empty fields" $ do
      let toks = tokenize (delims ":"<>sq<>emptyFields) "aaa : bebeb : qqq ::::" :: [Text]
      toks `shouldBe` ["aaa "," bebeb "," qqq ","","","",""]

    it "splits text with a single-quoted literal and empty fields as [Maybe Text]" $ do
      let toks = tokenize (delims ":"<>sq<>emptyFields) "aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
      toks `shouldBe` [Just "aaa ",Just " ",Just "bebeb:colon inside",Just " ",Just " qqq ",Nothing,Nothing,Nothing,Nothing]

    it "splits text using the ':' delimiter with single-quoted strings, empty fields and noslits" $ do
      let spec = sl<>delims ":"<>sq<>emptyFields<>noslits
      let toks = tokenize spec " aaa : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
      toks `shouldBe` [Just "aaa ",Just "bebeb:colon inside ",Just "qqq ",Nothing,Nothing,Nothing,Nothing]

    it "splits text using the ':' delimiter with single-quoted strings, empty fields, noslits and uw" $ do
      let spec = delims ":"<>sq<>emptyFields<>uw<>noslits
      let toks = tokenize spec " a b c : 'bebeb:colon inside' : qqq ::::" :: [Maybe Text]
      toks `shouldBe` [Just "a b c",Just "bebeb:colon inside",Just "qqq",Nothing,Nothing,Nothing,Nothing]
it "uses punctuation tokens" $ do
|
||||||
|
let spec = delims " \t"<>punct ",;()" <>emptyFields<>sq
|
||||||
|
let toks = tokenize spec "( delimeters , are , important, 'spaces are not');" :: [Text]
|
||||||
|
toks `shouldBe` ["(","delimeters",",","are",",","important",",","spaces are not",")",";"]
|
||||||
|
|
||||||
|
|
||||||
|
it "tokenize simple lisp-like text with keywords" $ do
|
||||||
|
let spec = delims " \n\t" <> comment ";"
|
||||||
|
<> punct "{}()[]<>"
|
||||||
|
<> sq <> sqq
|
||||||
|
<> uw
|
||||||
|
<> keywords ["define","apply","+"]
|
||||||
|
|
||||||
|
let code = [q|
|
||||||
|
(define add (a b ) ; define simple function
|
||||||
|
(+ a b) )
|
||||||
|
(define r (add 10 20))
|
||||||
|
|]
|
||||||
|
|
||||||
|
let toks = tokenize spec code :: [TTok]
|
||||||
|
|
||||||
|
let expected = [ TPunct '('
|
||||||
|
, TKeyword "define"
|
||||||
|
, TText "add" , TPunct '(', TText "a" , TText "b", TPunct ')'
|
||||||
|
, TPunct '(', TKeyword "+", TText "a",TText "b",TPunct ')',TPunct ')'
|
||||||
|
, TPunct '(',TKeyword "define"
|
||||||
|
,TText "r"
|
||||||
|
,TPunct '(',TText "add",TText "10",TText "20"
|
||||||
|
,TPunct ')',TPunct ')']
|
||||||
|
|
||||||
|
toks `shouldBe` expected
|
||||||
|
|
||||||
|
|
||||||
|
describe "Checks indentation support" $ do
|
||||||
|
|
||||||
|
let spec = delims " \n\t" <> comment ";"
|
||||||
|
<> punct "{}()[]<>"
|
||||||
|
<> sq <> sqq
|
||||||
|
<> uw
|
||||||
|
<> indent
|
||||||
|
<> itabstops 8
|
||||||
|
<> keywords ["define"]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
it "parses some indented blocks" $ do
|
||||||
|
|
||||||
|
let expected = [ TIndent 0, TKeyword "define", TText "a", TText "0"
|
||||||
|
, TIndent 2, TText "atom", TText "foo", TText "2"
|
||||||
|
, TIndent 2, TKeyword "define", TText "aq", TText "2"
|
||||||
|
, TIndent 4, TText "atom", TText "one", TText "4"
|
||||||
|
, TIndent 4, TText "atom", TText "two", TText "4"
|
||||||
|
, TIndent 0, TKeyword "define", TText "b", TText "0"
|
||||||
|
, TIndent 2, TText "atom", TText "baar", TText "2"
|
||||||
|
, TIndent 2, TText "atom", TText "quux", TText "2"
|
||||||
|
, TIndent 2, TKeyword "define", TText "new", TText "2"
|
||||||
|
, TIndent 6, TText "atom", TText "bar", TText "6"
|
||||||
|
, TIndent 4, TText "atom", TText "fuu", TText "4"
|
||||||
|
, TIndent 0
|
||||||
|
]
|
||||||
|
|
||||||
|
let pyLike = [q|
|
||||||
|
define a 0
|
||||||
|
atom foo 2
|
||||||
|
define aq 2
|
||||||
|
atom one 4
|
||||||
|
atom two 4
|
||||||
|
|
||||||
|
define b 0
|
||||||
|
atom baar 2
|
||||||
|
atom quux 2
|
||||||
|
define new 2
|
||||||
|
atom bar 6
|
||||||
|
atom fuu 4
|
||||||
|
|
||||||
|
|]
|
||||||
|
let toks = tokenize spec pyLike :: [TTok]
|
||||||
|
toks `shouldBe` expected
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1 @@
{-# OPTIONS_GHC -F -pgmF hspec-discover #-}