mirror of https://github.com/voidlizard/hbs2
wip
This commit is contained in:
parent
050603f82b
commit
7f344a7f72
|
@ -51,6 +51,7 @@ import Codec.Compression.Zstd qualified as Zstd
|
|||
import Codec.Compression.Zstd.Streaming qualified as ZstdS
|
||||
import Codec.Compression.Zstd.Streaming (Result(..))
|
||||
import Codec.Compression.Zstd (maxCLevel)
|
||||
import Codec.Compression.Zstd.Lazy qualified as ZstdL
|
||||
|
||||
import Codec.Compression.Zlib qualified as Zlib
|
||||
|
||||
|
@ -1288,17 +1289,37 @@ consumed = gets LBS.null
|
|||
runConsumeLBS :: Monad m => ByteString -> ConsumeLBS m a -> m a
|
||||
runConsumeLBS s m = evalStateT (fromConsumeLBS m) s
|
||||
|
||||
newtype ConsumeBS m a = ConsumeBS { fromConsumeBS :: StateT BS.ByteString m a }
|
||||
deriving newtype ( Applicative
|
||||
, Functor
|
||||
, Monad
|
||||
, MonadState BS.ByteString
|
||||
, MonadIO
|
||||
, MonadTrans
|
||||
)
|
||||
|
||||
|
||||
instance Monad m => BytesReader (ConsumeLBS m) where
|
||||
readBytes n = readChunkSimple n
|
||||
noBytesLeft = consumed
|
||||
|
||||
readLogFileLBS :: forall opts m . ( MonadIO m, ReadLogOpts opts )
|
||||
instance Monad m => BytesReader (ConsumeBS m) where
|
||||
noBytesLeft = gets BS.null
|
||||
readBytes n = do
|
||||
s <- get
|
||||
let (a,b) = BS.splitAt n s
|
||||
put $! b
|
||||
pure (LBS.fromStrict a)
|
||||
|
||||
runConsumeBS :: Monad m => BS.ByteString -> ConsumeBS m a -> m a
|
||||
runConsumeBS s m = evalStateT (fromConsumeBS m) s
|
||||
|
||||
readLogFileLBS :: forall opts m . ( MonadIO m, ReadLogOpts opts, BytesReader m )
|
||||
=> opts
|
||||
-> ByteString
|
||||
-> ( GitHash -> Int -> ByteString -> m () )
|
||||
-> m Int
|
||||
|
||||
readLogFileLBS _ lbs action = runConsumeLBS lbs $ flip fix 0 \go n -> do
|
||||
readLogFileLBS _ action = flip fix 0 \go n -> do
|
||||
done <- noBytesLeft
|
||||
if done then pure n
|
||||
else do
|
||||
|
@ -1313,7 +1334,7 @@ readLogFileLBS _ lbs action = runConsumeLBS lbs $ flip fix 0 \go n -> do
|
|||
sdata <- readBytesMaybe ( ssize - 20 )
|
||||
>>= orThrow SomeReadLogError
|
||||
|
||||
void $ lift $ action hash (fromIntegral ssize) sdata
|
||||
void $ action hash (fromIntegral ssize) sdata
|
||||
go (succ n)
|
||||
|
||||
-- FIXME: move-to-suckless-script
|
||||
|
@ -1799,52 +1820,102 @@ theDict = do
|
|||
|
||||
r <- S.toList_ $ for_ files $ \f -> do
|
||||
lbs <- liftIO $ LBS.readFile f
|
||||
readLogFileLBS () lbs $ \h _ _ -> do
|
||||
when (h == what) (S.yield f)
|
||||
runConsumeLBS lbs $ readLogFileLBS () $ \h _ _ -> do
|
||||
when (h == what) (lift $ S.yield f)
|
||||
|
||||
for_ (HS.fromList r) $ \x -> do
|
||||
liftIO $ print x
|
||||
|
||||
entry $ bindMatch "test:git:zstd:packed:list" $ nil_ $ \case
|
||||
[StringLike fn] -> lift do
|
||||
lbs <- liftIO$ LBS.readFile fn
|
||||
|
||||
runConsumeLBS (ZstdL.decompress lbs) $ readLogFileLBS () $ \h s _ -> do
|
||||
debug $ "object" <+> pretty h <+> pretty s
|
||||
|
||||
_ -> throwIO (BadFormException @C nil)
|
||||
|
||||
entry $ bindMatch "test:git:zstd:pack" $ nil_ $ \syn -> do
|
||||
|
||||
let (opts,argz) = splitOpts [("-l",1)] syn
|
||||
|
||||
fn <- headMay [ x | StringLike x <- argz ] & orThrowUser "filename not set"
|
||||
let l = headDef 5 [ fromIntegral l | ListVal [StringLike "-l", LitIntVal l ] <- opts ]
|
||||
|
||||
file <- liftIO $ LBS.readFile fn
|
||||
let z = ZstdL.compress l file
|
||||
liftIO $ LBS.writeFile (fn <> ".z") z
|
||||
|
||||
entry $ bindMatch "test:git:zstd:train" $ nil_ $ \case
|
||||
[ StringLike fn ] -> do
|
||||
[ StringLike fn ] -> lift do
|
||||
file <- liftIO $ mmapFileByteString fn Nothing
|
||||
pure ()
|
||||
|
||||
_total <- newTVarIO 0
|
||||
|
||||
samples' <- S.toList_ ( runConsumeBS file $ readLogFileLBS () $ \h s lbs -> do
|
||||
|
||||
atomically $ modifyTVar _total succ
|
||||
lift $ S.yield (LBS.toStrict (LBS.take (256*1024) lbs)) )
|
||||
|
||||
let samples = [x | (1,x) <- zip (cycle [1..10]) samples' ]
|
||||
|
||||
dictionary <- Zstd.trainFromSamples (256 * 1024) samples
|
||||
& orThrowUser "can't train dictionary"
|
||||
|
||||
debug $ "dict size" <+> pretty (BS.length $ Zstd.fromDict dictionary)
|
||||
|
||||
flip runContT pure do
|
||||
|
||||
ContT $ withAsync $ forever do
|
||||
pause @'Seconds 1
|
||||
p <- readTVarIO _total
|
||||
liftIO $ IO.hPutStr stderr (" \r" <> show (pretty p))
|
||||
|
||||
fh <- ContT $ withBinaryFile (fn <> ".packed") AppendMode
|
||||
void $ runConsumeBS file $ readLogFileLBS () $ \h s lbs -> do
|
||||
|
||||
let packed = Zstd.compressUsingDict dictionary maxCLevel (LBS.toStrict lbs)
|
||||
-- let packed = Zstd.compress maxCLevel (LBS.toStrict lbs)
|
||||
|
||||
let kbs = coerce @_ @BS.ByteString h
|
||||
let keySize = BS.length kbs
|
||||
|
||||
let objectSize = BS.length packed & fromIntegral
|
||||
let entrySize = fromIntegral $ keySize + objectSize
|
||||
let entry = LBS.toStrict $ Builder.toLazyByteString $ Builder.word32BE entrySize
|
||||
|
||||
liftIO do
|
||||
atomically $ modifyTVar _total pred
|
||||
-- debug $ "entry size" <+> pretty (BS.length entry) <+> pretty h <+> pretty entrySize
|
||||
BS.hPutStr fh entry
|
||||
BS.hPutStr fh kbs
|
||||
BS.hPutStr fh packed
|
||||
|
||||
|
||||
_ -> throwIO (BadFormException @C nil)
|
||||
|
||||
entry $ bindMatch "test:git:read-log-file" $ nil_ $ \syn -> lift do
|
||||
let (_, argz) = splitOpts [] syn
|
||||
fname <- headMay [ x | StringLike x <- argz] & orThrowUser "log file not set"
|
||||
flip runContT pure do
|
||||
h <- ContT $ bracket (openFile fname ReadMode) hClose
|
||||
|
||||
fix \next -> do
|
||||
eof <- hIsEOF h
|
||||
unless eof do
|
||||
size <- liftIO (BS.hGet h 4) <&> fromIntegral . N.word32
|
||||
debug $ "size" <+> pretty size
|
||||
bshash <- liftIO (BS.hGet h 20) <&> GitHash
|
||||
hSeek h RelativeSeek (size - 20)
|
||||
-- lbs <- liftIO (LBS.hGet h (size - 20))
|
||||
debug $ "object" <+> pretty bshash
|
||||
next
|
||||
file <- liftIO $ mmapFileByteString fname Nothing
|
||||
void $ runConsumeBS file $ readLogFileLBS () $ \h s lbs -> do
|
||||
debug $ "object" <+> pretty h <+> pretty s
|
||||
|
||||
entry $ bindMatch "test:git:read-log-lbs" $ nil_ $ \syn -> lift do
|
||||
let (_, argz) = splitOpts [] syn
|
||||
fname <- headMay [ x | StringLike x <- argz] & orThrowUser "log file not set"
|
||||
theLog <- liftIO $ LBS.readFile fname
|
||||
void $ readLogFileLBS () theLog $ \h s lbs -> do
|
||||
void $ runConsumeLBS theLog $ readLogFileLBS () $ \h s lbs -> do
|
||||
debug $ "object" <+> pretty h
|
||||
|
||||
|
||||
entry $ bindMatch "test:git:log:index:naive:dump" $ nil_ $ \syn -> lift do
|
||||
let (_, argz) = splitOpts [] syn
|
||||
fname <- headMay [ x | StringLike x <- argz] & orThrowUser "no file"
|
||||
|
||||
lbs <- liftIO $ LBS.readFile fname
|
||||
bs <- liftIO $ mmapFileByteString fname Nothing
|
||||
|
||||
runConsumeLBS lbs $ flip fix 0 \go n -> do
|
||||
done <- consumed
|
||||
runConsumeBS bs $ flip fix 0 \go n -> do
|
||||
done <- noBytesLeft
|
||||
if done then pure ()
|
||||
else do
|
||||
ssize <- readBytesMaybe 4
|
||||
|
@ -2015,8 +2086,8 @@ theDict = do
|
|||
for_ fnames $ \f -> do
|
||||
theLog <- liftIO $ LBS.readFile f
|
||||
|
||||
void $ readLogFileLBS () theLog $ \h s lbs -> do
|
||||
S.yield (coerce @_ @BS.ByteString h)
|
||||
void $ runConsumeLBS theLog $ readLogFileLBS () $ \h s lbs -> do
|
||||
lift $ S.yield (coerce @_ @BS.ByteString h)
|
||||
debug $ "object" <+> pretty h
|
||||
|
||||
let sorted = Set.toList $ Set.fromList all
|
||||
|
@ -2047,7 +2118,9 @@ theDict = do
|
|||
|
||||
_already <- newTVarIO mempty
|
||||
|
||||
let notWrittenYet x = do
|
||||
let
|
||||
notWrittenYet :: forall m . MonadIO m => GitHash -> m Bool
|
||||
notWrittenYet x = do
|
||||
already <- readTVarIO _already <&> HS.member x
|
||||
-- alsoInIdx <- maybe1 mmaped (pure False) $ \m -> do
|
||||
-- let found = binarySearch m 24 (coerce x) & isJust
|
||||
|
@ -2058,18 +2131,33 @@ theDict = do
|
|||
|
||||
flip runContT pure do
|
||||
|
||||
tnum <- getNumCapabilities
|
||||
tnum' <- getNumCapabilities
|
||||
|
||||
let chunks = chunksOf (total `div` tnum) r
|
||||
let tnum = if total < 100 then 0 else max 0 (floor (logBase 2 (realToFrac total)) - 1)
|
||||
|
||||
liftIO $ forConcurrently_ chunks $ \chunk -> flip runContT pure do
|
||||
liftIO $ print $ red "TNUM" <+> pretty tnum <+> pretty total
|
||||
|
||||
queues <- replicateM (tnum+1) newTQueueIO <&> Vector.fromList
|
||||
|
||||
feeder <- ContT $ withAsync do
|
||||
let balanced = zip (cycle [0..tnum]) r
|
||||
for_ balanced $ \(i,c) -> atomically $ writeTQueue (queues ! i) (Just c)
|
||||
atomically $ for_ queues (`writeTQueue` Nothing)
|
||||
|
||||
workers <- liftIO $ for [0..tnum] $ \i -> async $ flip runContT pure do
|
||||
|
||||
suff <- liftIO $ randomIO @Word32
|
||||
theReader <- ContT $ withGitCat
|
||||
ofile <- ContT $ withBinaryFile (show $ pretty "export-" <> pretty suff <>".log") AppendMode
|
||||
|
||||
void $ ContT $ bracket none (const $ stopProcess theReader)
|
||||
for_ chunk $ \commit -> do
|
||||
|
||||
liftIO do
|
||||
fix \loop -> flip runContT pure do
|
||||
suff <- liftIO $ randomIO @Word32
|
||||
ofile <- ContT $ withBinaryFile (show $ pretty "export-" <> pretty suff <>".log") AppendMode
|
||||
fix \loop2 -> do
|
||||
atomically (readTQueue (queues ! i)) >>= \case
|
||||
Nothing -> none
|
||||
Just commit -> do
|
||||
debug $ "write commit and shit" <+> pretty commit
|
||||
|
||||
hashes <- gitReadTreeObjectsOnly commit
|
||||
<&> (commit:)
|
||||
|
@ -2083,18 +2171,57 @@ theDict = do
|
|||
let kbs = coerce @_ @BS.ByteString gh
|
||||
let keySize = BS.length kbs
|
||||
|
||||
-- debug $ pretty gh <+> pretty keySize
|
||||
|
||||
let objectSize = LBS.length lbs & fromIntegral
|
||||
let entrySize = fromIntegral $ keySize + objectSize
|
||||
let entry = LBS.toStrict $ Builder.toLazyByteString $ Builder.word32BE entrySize
|
||||
|
||||
liftIO do
|
||||
let entry = mconcat [ Builder.word32BE entrySize
|
||||
, Builder.byteString kbs
|
||||
, Builder.lazyByteString lbs
|
||||
]
|
||||
|
||||
atomically $ modifyTVar _already (HS.insert gh)
|
||||
debug $ "entry size" <+> pretty (BS.length entry) <+> pretty gh <+> pretty entrySize
|
||||
BS.hPutStr ofile entry
|
||||
BS.hPutStr ofile kbs
|
||||
LBS.hPutStr ofile lbs
|
||||
liftIO $ LBS.hPutStr ofile ( Builder.toLazyByteString entry )
|
||||
|
||||
loop2
|
||||
|
||||
mapM_ wait (feeder:workers)
|
||||
|
||||
|
||||
-- let chunks = chunksOf (total `div` tnum) r
|
||||
|
||||
-- liftIO $ forConcurrently_ chunks $ \chunk -> flip runContT pure do
|
||||
|
||||
-- suff <- liftIO $ randomIO @Word32
|
||||
-- theReader <- ContT $ withGitCat
|
||||
-- ofile <- ContT $ withBinaryFile (show $ pretty "export-" <> pretty suff <>".log") AppendMode
|
||||
|
||||
-- void $ ContT $ bracket none (const $ stopProcess theReader)
|
||||
-- for_ chunk $ \commit -> do
|
||||
|
||||
-- hashes <- gitReadTreeObjectsOnly commit
|
||||
-- <&> (commit:)
|
||||
-- >>= filterM notWrittenYet
|
||||
|
||||
-- for_ hashes $ \gh -> do
|
||||
|
||||
-- (_t,lbs) <- gitReadObjectMaybe theReader gh
|
||||
-- >>= orThrow (GitReadError (show $ pretty gh))
|
||||
|
||||
-- let kbs = coerce @_ @BS.ByteString gh
|
||||
-- let keySize = BS.length kbs
|
||||
|
||||
-- -- debug $ pretty gh <+> pretty keySize
|
||||
|
||||
-- let objectSize = LBS.length lbs & fromIntegral
|
||||
-- let entrySize = fromIntegral $ keySize + objectSize
|
||||
-- let entry = LBS.toStrict $ Builder.toLazyByteString $ Builder.word32BE entrySize
|
||||
|
||||
-- liftIO do
|
||||
-- atomically $ modifyTVar _already (HS.insert gh)
|
||||
-- debug $ "entry size" <+> pretty (BS.length entry) <+> pretty gh <+> pretty entrySize
|
||||
-- BS.hPutStr ofile entry
|
||||
-- BS.hPutStr ofile kbs
|
||||
-- LBS.hPutStr ofile lbs
|
||||
|
||||
linearSearchLBS hash lbs = do
|
||||
|
||||
|
|
Loading…
Reference in New Issue