mirror of https://github.com/voidlizard/hbs2
wip
This commit is contained in:
parent
050603f82b
commit
7f344a7f72
|
@ -51,6 +51,7 @@ import Codec.Compression.Zstd qualified as Zstd
|
||||||
import Codec.Compression.Zstd.Streaming qualified as ZstdS
|
import Codec.Compression.Zstd.Streaming qualified as ZstdS
|
||||||
import Codec.Compression.Zstd.Streaming (Result(..))
|
import Codec.Compression.Zstd.Streaming (Result(..))
|
||||||
import Codec.Compression.Zstd (maxCLevel)
|
import Codec.Compression.Zstd (maxCLevel)
|
||||||
|
import Codec.Compression.Zstd.Lazy qualified as ZstdL
|
||||||
|
|
||||||
import Codec.Compression.Zlib qualified as Zlib
|
import Codec.Compression.Zlib qualified as Zlib
|
||||||
|
|
||||||
|
@ -1288,17 +1289,37 @@ consumed = gets LBS.null
|
||||||
runConsumeLBS :: Monad m => ByteString -> ConsumeLBS m a -> m a
|
runConsumeLBS :: Monad m => ByteString -> ConsumeLBS m a -> m a
|
||||||
runConsumeLBS s m = evalStateT (fromConsumeLBS m) s
|
runConsumeLBS s m = evalStateT (fromConsumeLBS m) s
|
||||||
|
|
||||||
|
newtype ConsumeBS m a = ConsumeBS { fromConsumeBS :: StateT BS.ByteString m a }
|
||||||
|
deriving newtype ( Applicative
|
||||||
|
, Functor
|
||||||
|
, Monad
|
||||||
|
, MonadState BS.ByteString
|
||||||
|
, MonadIO
|
||||||
|
, MonadTrans
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
instance Monad m => BytesReader (ConsumeLBS m) where
|
instance Monad m => BytesReader (ConsumeLBS m) where
|
||||||
readBytes n = readChunkSimple n
|
readBytes n = readChunkSimple n
|
||||||
noBytesLeft = consumed
|
noBytesLeft = consumed
|
||||||
|
|
||||||
readLogFileLBS :: forall opts m . ( MonadIO m, ReadLogOpts opts )
|
instance Monad m => BytesReader (ConsumeBS m) where
|
||||||
|
noBytesLeft = gets BS.null
|
||||||
|
readBytes n = do
|
||||||
|
s <- get
|
||||||
|
let (a,b) = BS.splitAt n s
|
||||||
|
put $! b
|
||||||
|
pure (LBS.fromStrict a)
|
||||||
|
|
||||||
|
runConsumeBS :: Monad m => BS.ByteString -> ConsumeBS m a -> m a
|
||||||
|
runConsumeBS s m = evalStateT (fromConsumeBS m) s
|
||||||
|
|
||||||
|
readLogFileLBS :: forall opts m . ( MonadIO m, ReadLogOpts opts, BytesReader m )
|
||||||
=> opts
|
=> opts
|
||||||
-> ByteString
|
|
||||||
-> ( GitHash -> Int -> ByteString -> m () )
|
-> ( GitHash -> Int -> ByteString -> m () )
|
||||||
-> m Int
|
-> m Int
|
||||||
|
|
||||||
readLogFileLBS _ lbs action = runConsumeLBS lbs $ flip fix 0 \go n -> do
|
readLogFileLBS _ action = flip fix 0 \go n -> do
|
||||||
done <- noBytesLeft
|
done <- noBytesLeft
|
||||||
if done then pure n
|
if done then pure n
|
||||||
else do
|
else do
|
||||||
|
@ -1313,7 +1334,7 @@ readLogFileLBS _ lbs action = runConsumeLBS lbs $ flip fix 0 \go n -> do
|
||||||
sdata <- readBytesMaybe ( ssize - 20 )
|
sdata <- readBytesMaybe ( ssize - 20 )
|
||||||
>>= orThrow SomeReadLogError
|
>>= orThrow SomeReadLogError
|
||||||
|
|
||||||
void $ lift $ action hash (fromIntegral ssize) sdata
|
void $ action hash (fromIntegral ssize) sdata
|
||||||
go (succ n)
|
go (succ n)
|
||||||
|
|
||||||
-- FIXME: move-to-suckless-script
|
-- FIXME: move-to-suckless-script
|
||||||
|
@ -1799,52 +1820,102 @@ theDict = do
|
||||||
|
|
||||||
r <- S.toList_ $ for_ files $ \f -> do
|
r <- S.toList_ $ for_ files $ \f -> do
|
||||||
lbs <- liftIO $ LBS.readFile f
|
lbs <- liftIO $ LBS.readFile f
|
||||||
readLogFileLBS () lbs $ \h _ _ -> do
|
runConsumeLBS lbs $ readLogFileLBS () $ \h _ _ -> do
|
||||||
when (h == what) (S.yield f)
|
when (h == what) (lift $ S.yield f)
|
||||||
|
|
||||||
for_ (HS.fromList r) $ \x -> do
|
for_ (HS.fromList r) $ \x -> do
|
||||||
liftIO $ print x
|
liftIO $ print x
|
||||||
|
|
||||||
|
entry $ bindMatch "test:git:zstd:packed:list" $ nil_ $ \case
|
||||||
|
[StringLike fn] -> lift do
|
||||||
|
lbs <- liftIO$ LBS.readFile fn
|
||||||
|
|
||||||
|
runConsumeLBS (ZstdL.decompress lbs) $ readLogFileLBS () $ \h s _ -> do
|
||||||
|
debug $ "object" <+> pretty h <+> pretty s
|
||||||
|
|
||||||
|
_ -> throwIO (BadFormException @C nil)
|
||||||
|
|
||||||
|
entry $ bindMatch "test:git:zstd:pack" $ nil_ $ \syn -> do
|
||||||
|
|
||||||
|
let (opts,argz) = splitOpts [("-l",1)] syn
|
||||||
|
|
||||||
|
fn <- headMay [ x | StringLike x <- argz ] & orThrowUser "filename not set"
|
||||||
|
let l = headDef 5 [ fromIntegral l | ListVal [StringLike "-l", LitIntVal l ] <- opts ]
|
||||||
|
|
||||||
|
file <- liftIO $ LBS.readFile fn
|
||||||
|
let z = ZstdL.compress l file
|
||||||
|
liftIO $ LBS.writeFile (fn <> ".z") z
|
||||||
|
|
||||||
entry $ bindMatch "test:git:zstd:train" $ nil_ $ \case
|
entry $ bindMatch "test:git:zstd:train" $ nil_ $ \case
|
||||||
[ StringLike fn ] -> do
|
[ StringLike fn ] -> lift do
|
||||||
file <- liftIO $ mmapFileByteString fn Nothing
|
file <- liftIO $ mmapFileByteString fn Nothing
|
||||||
pure ()
|
|
||||||
|
_total <- newTVarIO 0
|
||||||
|
|
||||||
|
samples' <- S.toList_ ( runConsumeBS file $ readLogFileLBS () $ \h s lbs -> do
|
||||||
|
|
||||||
|
atomically $ modifyTVar _total succ
|
||||||
|
lift $ S.yield (LBS.toStrict (LBS.take (256*1024) lbs)) )
|
||||||
|
|
||||||
|
let samples = [x | (1,x) <- zip (cycle [1..10]) samples' ]
|
||||||
|
|
||||||
|
dictionary <- Zstd.trainFromSamples (256 * 1024) samples
|
||||||
|
& orThrowUser "can't train dictionary"
|
||||||
|
|
||||||
|
debug $ "dict size" <+> pretty (BS.length $ Zstd.fromDict dictionary)
|
||||||
|
|
||||||
|
flip runContT pure do
|
||||||
|
|
||||||
|
ContT $ withAsync $ forever do
|
||||||
|
pause @'Seconds 1
|
||||||
|
p <- readTVarIO _total
|
||||||
|
liftIO $ IO.hPutStr stderr (" \r" <> show (pretty p))
|
||||||
|
|
||||||
|
fh <- ContT $ withBinaryFile (fn <> ".packed") AppendMode
|
||||||
|
void $ runConsumeBS file $ readLogFileLBS () $ \h s lbs -> do
|
||||||
|
|
||||||
|
let packed = Zstd.compressUsingDict dictionary maxCLevel (LBS.toStrict lbs)
|
||||||
|
-- let packed = Zstd.compress maxCLevel (LBS.toStrict lbs)
|
||||||
|
|
||||||
|
let kbs = coerce @_ @BS.ByteString h
|
||||||
|
let keySize = BS.length kbs
|
||||||
|
|
||||||
|
let objectSize = BS.length packed & fromIntegral
|
||||||
|
let entrySize = fromIntegral $ keySize + objectSize
|
||||||
|
let entry = LBS.toStrict $ Builder.toLazyByteString $ Builder.word32BE entrySize
|
||||||
|
|
||||||
|
liftIO do
|
||||||
|
atomically $ modifyTVar _total pred
|
||||||
|
-- debug $ "entry size" <+> pretty (BS.length entry) <+> pretty h <+> pretty entrySize
|
||||||
|
BS.hPutStr fh entry
|
||||||
|
BS.hPutStr fh kbs
|
||||||
|
BS.hPutStr fh packed
|
||||||
|
|
||||||
|
|
||||||
_ -> throwIO (BadFormException @C nil)
|
_ -> throwIO (BadFormException @C nil)
|
||||||
|
|
||||||
entry $ bindMatch "test:git:read-log-file" $ nil_ $ \syn -> lift do
|
entry $ bindMatch "test:git:read-log-file" $ nil_ $ \syn -> lift do
|
||||||
let (_, argz) = splitOpts [] syn
|
let (_, argz) = splitOpts [] syn
|
||||||
fname <- headMay [ x | StringLike x <- argz] & orThrowUser "log file not set"
|
fname <- headMay [ x | StringLike x <- argz] & orThrowUser "log file not set"
|
||||||
flip runContT pure do
|
file <- liftIO $ mmapFileByteString fname Nothing
|
||||||
h <- ContT $ bracket (openFile fname ReadMode) hClose
|
void $ runConsumeBS file $ readLogFileLBS () $ \h s lbs -> do
|
||||||
|
debug $ "object" <+> pretty h <+> pretty s
|
||||||
fix \next -> do
|
|
||||||
eof <- hIsEOF h
|
|
||||||
unless eof do
|
|
||||||
size <- liftIO (BS.hGet h 4) <&> fromIntegral . N.word32
|
|
||||||
debug $ "size" <+> pretty size
|
|
||||||
bshash <- liftIO (BS.hGet h 20) <&> GitHash
|
|
||||||
hSeek h RelativeSeek (size - 20)
|
|
||||||
-- lbs <- liftIO (LBS.hGet h (size - 20))
|
|
||||||
debug $ "object" <+> pretty bshash
|
|
||||||
next
|
|
||||||
|
|
||||||
entry $ bindMatch "test:git:read-log-lbs" $ nil_ $ \syn -> lift do
|
entry $ bindMatch "test:git:read-log-lbs" $ nil_ $ \syn -> lift do
|
||||||
let (_, argz) = splitOpts [] syn
|
let (_, argz) = splitOpts [] syn
|
||||||
fname <- headMay [ x | StringLike x <- argz] & orThrowUser "log file not set"
|
fname <- headMay [ x | StringLike x <- argz] & orThrowUser "log file not set"
|
||||||
theLog <- liftIO $ LBS.readFile fname
|
theLog <- liftIO $ LBS.readFile fname
|
||||||
void $ readLogFileLBS () theLog $ \h s lbs -> do
|
void $ runConsumeLBS theLog $ readLogFileLBS () $ \h s lbs -> do
|
||||||
debug $ "object" <+> pretty h
|
debug $ "object" <+> pretty h
|
||||||
|
|
||||||
|
|
||||||
entry $ bindMatch "test:git:log:index:naive:dump" $ nil_ $ \syn -> lift do
|
entry $ bindMatch "test:git:log:index:naive:dump" $ nil_ $ \syn -> lift do
|
||||||
let (_, argz) = splitOpts [] syn
|
let (_, argz) = splitOpts [] syn
|
||||||
fname <- headMay [ x | StringLike x <- argz] & orThrowUser "no file"
|
fname <- headMay [ x | StringLike x <- argz] & orThrowUser "no file"
|
||||||
|
|
||||||
lbs <- liftIO $ LBS.readFile fname
|
bs <- liftIO $ mmapFileByteString fname Nothing
|
||||||
|
|
||||||
runConsumeLBS lbs $ flip fix 0 \go n -> do
|
runConsumeBS bs $ flip fix 0 \go n -> do
|
||||||
done <- consumed
|
done <- noBytesLeft
|
||||||
if done then pure ()
|
if done then pure ()
|
||||||
else do
|
else do
|
||||||
ssize <- readBytesMaybe 4
|
ssize <- readBytesMaybe 4
|
||||||
|
@ -1863,7 +1934,7 @@ theDict = do
|
||||||
|
|
||||||
lbs <- liftIO $ LBS.readFile fn
|
lbs <- liftIO $ LBS.readFile fn
|
||||||
|
|
||||||
hashes <- S.toList_ $ runConsumeLBS lbs $ flip fix 0 \go n -> do
|
hashes <- S.toList_ $ runConsumeLBS lbs $ flip fix 0 \go n -> do
|
||||||
done <- consumed
|
done <- consumed
|
||||||
if done then pure ()
|
if done then pure ()
|
||||||
else do
|
else do
|
||||||
|
@ -2015,8 +2086,8 @@ theDict = do
|
||||||
for_ fnames $ \f -> do
|
for_ fnames $ \f -> do
|
||||||
theLog <- liftIO $ LBS.readFile f
|
theLog <- liftIO $ LBS.readFile f
|
||||||
|
|
||||||
void $ readLogFileLBS () theLog $ \h s lbs -> do
|
void $ runConsumeLBS theLog $ readLogFileLBS () $ \h s lbs -> do
|
||||||
S.yield (coerce @_ @BS.ByteString h)
|
lift $ S.yield (coerce @_ @BS.ByteString h)
|
||||||
debug $ "object" <+> pretty h
|
debug $ "object" <+> pretty h
|
||||||
|
|
||||||
let sorted = Set.toList $ Set.fromList all
|
let sorted = Set.toList $ Set.fromList all
|
||||||
|
@ -2047,7 +2118,9 @@ theDict = do
|
||||||
|
|
||||||
_already <- newTVarIO mempty
|
_already <- newTVarIO mempty
|
||||||
|
|
||||||
let notWrittenYet x = do
|
let
|
||||||
|
notWrittenYet :: forall m . MonadIO m => GitHash -> m Bool
|
||||||
|
notWrittenYet x = do
|
||||||
already <- readTVarIO _already <&> HS.member x
|
already <- readTVarIO _already <&> HS.member x
|
||||||
-- alsoInIdx <- maybe1 mmaped (pure False) $ \m -> do
|
-- alsoInIdx <- maybe1 mmaped (pure False) $ \m -> do
|
||||||
-- let found = binarySearch m 24 (coerce x) & isJust
|
-- let found = binarySearch m 24 (coerce x) & isJust
|
||||||
|
@ -2058,43 +2131,97 @@ theDict = do
|
||||||
|
|
||||||
flip runContT pure do
|
flip runContT pure do
|
||||||
|
|
||||||
tnum <- getNumCapabilities
|
tnum' <- getNumCapabilities
|
||||||
|
|
||||||
let chunks = chunksOf (total `div` tnum) r
|
let tnum = if total < 100 then 0 else max 0 (floor (logBase 2 (realToFrac total)) - 1)
|
||||||
|
|
||||||
liftIO $ forConcurrently_ chunks $ \chunk -> flip runContT pure do
|
liftIO $ print $ red "TNUM" <+> pretty tnum <+> pretty total
|
||||||
|
|
||||||
|
queues <- replicateM (tnum+1) newTQueueIO <&> Vector.fromList
|
||||||
|
|
||||||
|
feeder <- ContT $ withAsync do
|
||||||
|
let balanced = zip (cycle [0..tnum]) r
|
||||||
|
for_ balanced $ \(i,c) -> atomically $ writeTQueue (queues ! i) (Just c)
|
||||||
|
atomically $ for_ queues (`writeTQueue` Nothing)
|
||||||
|
|
||||||
|
workers <- liftIO $ for [0..tnum] $ \i -> async $ flip runContT pure do
|
||||||
|
|
||||||
suff <- liftIO $ randomIO @Word32
|
|
||||||
theReader <- ContT $ withGitCat
|
theReader <- ContT $ withGitCat
|
||||||
ofile <- ContT $ withBinaryFile (show $ pretty "export-" <> pretty suff <>".log") AppendMode
|
|
||||||
|
|
||||||
void $ ContT $ bracket none (const $ stopProcess theReader)
|
void $ ContT $ bracket none (const $ stopProcess theReader)
|
||||||
for_ chunk $ \commit -> do
|
|
||||||
|
|
||||||
hashes <- gitReadTreeObjectsOnly commit
|
liftIO do
|
||||||
<&> (commit:)
|
fix \loop -> flip runContT pure do
|
||||||
>>= filterM notWrittenYet
|
suff <- liftIO $ randomIO @Word32
|
||||||
|
ofile <- ContT $ withBinaryFile (show $ pretty "export-" <> pretty suff <>".log") AppendMode
|
||||||
|
fix \loop2 -> do
|
||||||
|
atomically (readTQueue (queues ! i)) >>= \case
|
||||||
|
Nothing -> none
|
||||||
|
Just commit -> do
|
||||||
|
debug $ "write commit and shit" <+> pretty commit
|
||||||
|
|
||||||
for_ hashes $ \gh -> do
|
hashes <- gitReadTreeObjectsOnly commit
|
||||||
|
<&> (commit:)
|
||||||
|
>>= filterM notWrittenYet
|
||||||
|
|
||||||
(_t,lbs) <- gitReadObjectMaybe theReader gh
|
for_ hashes $ \gh -> do
|
||||||
>>= orThrow (GitReadError (show $ pretty gh))
|
|
||||||
|
|
||||||
let kbs = coerce @_ @BS.ByteString gh
|
(_t,lbs) <- gitReadObjectMaybe theReader gh
|
||||||
let keySize = BS.length kbs
|
>>= orThrow (GitReadError (show $ pretty gh))
|
||||||
|
|
||||||
-- debug $ pretty gh <+> pretty keySize
|
let kbs = coerce @_ @BS.ByteString gh
|
||||||
|
let keySize = BS.length kbs
|
||||||
|
|
||||||
let objectSize = LBS.length lbs & fromIntegral
|
let objectSize = LBS.length lbs & fromIntegral
|
||||||
let entrySize = fromIntegral $ keySize + objectSize
|
let entrySize = fromIntegral $ keySize + objectSize
|
||||||
let entry = LBS.toStrict $ Builder.toLazyByteString $ Builder.word32BE entrySize
|
|
||||||
|
|
||||||
liftIO do
|
let entry = mconcat [ Builder.word32BE entrySize
|
||||||
atomically $ modifyTVar _already (HS.insert gh)
|
, Builder.byteString kbs
|
||||||
debug $ "entry size" <+> pretty (BS.length entry) <+> pretty gh <+> pretty entrySize
|
, Builder.lazyByteString lbs
|
||||||
BS.hPutStr ofile entry
|
]
|
||||||
BS.hPutStr ofile kbs
|
|
||||||
LBS.hPutStr ofile lbs
|
atomically $ modifyTVar _already (HS.insert gh)
|
||||||
|
liftIO $ LBS.hPutStr ofile ( Builder.toLazyByteString entry )
|
||||||
|
|
||||||
|
loop2
|
||||||
|
|
||||||
|
mapM_ wait (feeder:workers)
|
||||||
|
|
||||||
|
|
||||||
|
-- let chunks = chunksOf (total `div` tnum) r
|
||||||
|
|
||||||
|
-- liftIO $ forConcurrently_ chunks $ \chunk -> flip runContT pure do
|
||||||
|
|
||||||
|
-- suff <- liftIO $ randomIO @Word32
|
||||||
|
-- theReader <- ContT $ withGitCat
|
||||||
|
-- ofile <- ContT $ withBinaryFile (show $ pretty "export-" <> pretty suff <>".log") AppendMode
|
||||||
|
|
||||||
|
-- void $ ContT $ bracket none (const $ stopProcess theReader)
|
||||||
|
-- for_ chunk $ \commit -> do
|
||||||
|
|
||||||
|
-- hashes <- gitReadTreeObjectsOnly commit
|
||||||
|
-- <&> (commit:)
|
||||||
|
-- >>= filterM notWrittenYet
|
||||||
|
|
||||||
|
-- for_ hashes $ \gh -> do
|
||||||
|
|
||||||
|
-- (_t,lbs) <- gitReadObjectMaybe theReader gh
|
||||||
|
-- >>= orThrow (GitReadError (show $ pretty gh))
|
||||||
|
|
||||||
|
-- let kbs = coerce @_ @BS.ByteString gh
|
||||||
|
-- let keySize = BS.length kbs
|
||||||
|
|
||||||
|
-- -- debug $ pretty gh <+> pretty keySize
|
||||||
|
|
||||||
|
-- let objectSize = LBS.length lbs & fromIntegral
|
||||||
|
-- let entrySize = fromIntegral $ keySize + objectSize
|
||||||
|
-- let entry = LBS.toStrict $ Builder.toLazyByteString $ Builder.word32BE entrySize
|
||||||
|
|
||||||
|
-- liftIO do
|
||||||
|
-- atomically $ modifyTVar _already (HS.insert gh)
|
||||||
|
-- debug $ "entry size" <+> pretty (BS.length entry) <+> pretty gh <+> pretty entrySize
|
||||||
|
-- BS.hPutStr ofile entry
|
||||||
|
-- BS.hPutStr ofile kbs
|
||||||
|
-- LBS.hPutStr ofile lbs
|
||||||
|
|
||||||
linearSearchLBS hash lbs = do
|
linearSearchLBS hash lbs = do
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue