voidlizard 2025-01-08 12:01:16 +03:00
parent ff7abbefc7
commit 218ff12000
2 changed files with 241 additions and 95 deletions

View File

@@ -93,7 +93,9 @@ import Data.Vector.Algorithms.Search qualified as MV
import UnliftIO.Concurrent
import UnliftIO.IO.File qualified as UIO
-import Data.BloomFilter.Easy qualified as Bloom
+import Control.Monad.ST
+import Data.BloomFilter qualified as Bloom
+import Data.BloomFilter.Mutable qualified as MBloom

{- HLINT ignore "Functor law" -}
{- HLINT ignore "Eta reduce" -}
@@ -146,6 +148,7 @@ recover m = fix \again -> do
        liftIO $ withGit3Env connected again
+      e -> throwIO e

---

data TreeReadState = TreeReadState
@@ -578,14 +581,6 @@ theDict = do
    entry $ bindMatch "zlib:deflate" $ nil_ $ const $ liftIO do
      LBS.hGetContents stdin <&> Zlib.decompress >>= LBS.hPutStr stdout

-    entry $ bindMatch "test:reflog:bloom:create" $ nil_ $ \syn -> lift do
-      r <- newTQueueIO
-      enumEntries $ \e -> do
-        atomically $ writeTQueue r (BS.take 20 e)
-      atomically (STM.flushTQueue r) <&> (Bloom.easyList 0.01)
-      notice $ "bloom filter build done"
-
    entry $ bindMatch "test:git:read-commit-chain" $ nil_ $ \syn -> lift do
      (mpath, hss) <- case syn of
        [ HashLike s ] -> pure (Nothing, s)
@@ -705,7 +700,8 @@ theDict = do
    entry $ bindMatch "test:reflog:index:search:binary:test:2" $ nil_ $ const $ lift do
      r <- newTQueueIO
-      enumEntries $ \e -> do
+      idx <- openIndex
+      enumEntries idx $ \e -> do
        let ha = GitHash $ coerce $ BS.take 20 e
        atomically $ writeTQueue r ha
@@ -958,19 +954,16 @@ theDict = do
          LBS.hPutStr fh contents

-    entry $ bindMatch "test:git:reflog:index:list:fast" $ nil_ $ \case
-      [ StringLike f ] -> lift do
-        bs <- liftIO $ mmapFileByteString f Nothing
-        scanBS bs $ \segment -> do
-          none
-          -- let (sha1,blake) = BS.splitAt 20 segment
-          --                      & over _1 (coerce @_ @GitHash)
-          --                      & over _2 (coerce @_ @HashRef)
-          -- notice $ pretty sha1 <+> pretty blake
-        liftIO $ print $ pretty "okay"
-      _ -> throwIO (BadFormException @C nil)
+    entry $ bindMatch "test:git:reflog:index:list:fast" $ nil_ $ const $ lift do
+      files <- listObjectIndexFiles
+      forConcurrently_ files $ \(f,_) -> do
+        bs <- liftIO $ mmapFileByteString f Nothing
+        scanBS bs $ \segment -> do
+          let (sha1,blake) = BS.splitAt 20 segment
+                               & over _1 (coerce @_ @GitHash)
+                               & over _2 (coerce @_ @HashRef)
+          notice $ pretty sha1 <+> pretty blake

    entry $ bindMatch "reflog:index:list" $ nil_ $ const $ lift do
      files <- listObjectIndexFiles
@@ -1068,7 +1061,6 @@ theDict = do
    entry $ bindMatch "reflog:index:path" $ nil_ $ const $ lift do
      indexPath >>= liftIO . print . pretty

    -- let entriesListOf lbs = S.toList_ $ runConsumeLBS lbs $ readSections $ \s ss -> do

    entry $ bindMatch "reflog:index:files" $ nil_ $ \syn -> lift do
      files <- listObjectIndexFiles
@@ -1079,7 +1071,8 @@ theDict = do
    entry $ bindMatch "reflog:index:list:tx" $ nil_ $ const $ lift do
      r <- newTVarIO ( mempty :: HashSet HashRef )
-      enumEntries $ \bs -> do
+      index <- openIndex
+      enumEntries index $ \bs -> do
        atomically $ modifyTVar r (HS.insert (coerce $ BS.take 32 $ BS.drop 20 bs))
      z <- readTVarIO r <&> HS.toList
      liftIO $ mapM_ ( print . pretty ) z
@@ -1087,67 +1080,33 @@ theDict = do
    entry $ bindMatch "reflog:index:build" $ nil_ $ const $ lift $ connectedDo do
      writeReflogIndex

-    entry $ bindMatch "git:list:objects:new" $ nil_ $ \syn -> lift do
-      let (opts,argz) = splitOpts [] syn
-
-      let what = headDef "HEAD" [ x | StringLike x <- argz ]
-      h0 <- gitRevParseThrow what
-
-      void $ flip runContT pure do
-
-        rq <- newTQueueIO
-        -- ContT $ withAsync (startReflogIndexQueryQueue rq)
-        -- let req h = do
-        --       let bs = coerce @GitHash @N.ByteString h
-        --       let tr = const True
-        --       w <- newEmptyTMVarIO
-        --       atomically $ writeTQueue rq (bs, tr, w)
-        --       r <- atomically $ readTMVar w
-        --       pure $ isNothing r
-
-        cache <- newTVarIO ( mempty :: HashSet GitHash )
-
-        -- read absolutely everything from the index into memory and build a HashSet;
-        -- so there is no benefit at all to this being an index,
-        -- it is a full scan in any case.
-        -- The index is a sorted sequence of [(GitHash, HashRef)]
-        -- as a bytestring in "SD" format, D ~ GitHash <> HashRef
-        --
-        --
-        (t,_) <- timeItT do
-          lift $ enumEntries $ \e -> do
-            atomically do
-              modifyTVar cache ( HS.insert (coerce $ BS.take 20 e) )
-
-        s <- readTVarIO cache <&> HS.size
-        notice $ pretty s <+> " records read at" <+> pretty t
-
-        l_ <- newTVarIO 0
-        h_ <- newTVarIO ( mempty :: HashSet GitHash )
-        s_ <- newTVarIO ( mempty :: HashMap GitHash Int )
-
-        let req h = do
-              atomically do
-                modifyTVar l_ succ
-                readTVar cache <&> not . HS.member h
-
-        -- read only the commits that are not in the index;
-        -- very fast, since there are relatively few objects
-        (t1,r) <- timeItT (lift $ readCommitChainHPSQ req Nothing h0 dontHandle)
-
-        notice $ pretty s <+> " new commits read at" <+> pretty (realToFrac @_ @(Fixed E3) t1)
-
-        cap <- liftIO getNumCapabilities
-        gitCatBatchQ <- contWorkerPool cap do
-          che <- ContT withGitCat
-          pure $ gitReadObjectMaybe che
-
-        new_ <- newTQueueIO
-        c1 <- newCacheFixedHPSQ 1000
-
-        (t3, _) <- timeItT $ lift $ forConcurrently_ (HPSQ.toList r) $ \(commit,_,_) -> do
-          (_,self) <- gitCatBatchQ commit
+    entry $ bindMatch "test:reflog:index:lookup" $ nil_ \case
+      [ GitHashLike h ] -> lift do
+        idx <- openIndex
+        what <- indexEntryLookup idx h >>= orThrowUser "object not found"
+        liftIO $ print $ pretty ( coerce @_ @HashRef what )
+
+      _ -> throwIO (BadFormException @C nil)
+
+    entry $ bindMatch "git:commit:list:objects:new" $ nil_ $ \case
+      [ StringLike what ] -> lift do
+        commit <- gitRevParseThrow what
+
+        idx <- openIndex
+
+        let req h = lift $ indexEntryLookup idx h <&> isNothing
+
+        flip runContT pure do
+          cap <- liftIO getNumCapabilities
+          gitCatBatchQ <- contWorkerPool cap do
+            che <- ContT withGitCat
+            pure $ gitReadObjectMaybe che
+
+          new_ <- newTQueueIO
+          c1 <- newCacheFixedHPSQ 1000
+
+          (_,self) <- lift $ gitCatBatchQ commit
                        >>= orThrow (GitReadError (show $ pretty commit))

          tree <- gitReadCommitTree self
@@ -1157,23 +1116,121 @@ theDict = do
            <&> ([commit,tree]<>)
            >>= filterM req
          --
-          atomically $ modifyTVar s_ (HM.insertWith (\old new -> max old new) commit (length hashes))
-          atomically $ modifyTVar h_ (HS.union (HS.fromList hashes))
          atomically $ mapM_ (writeTQueue new_) hashes

-        -- 1.8 seconds, and it grows noticeably with the number of commits, damn it
-        atomically (STM.flushTQueue new_) >>= liftIO . print . pretty . length
-
-        l <- readTVarIO l_
-        n <- readTVarIO h_
-        w <- readTVarIO s_ <&> HM.elems
-        let a = realToFrac (sum w) / realToFrac (length w)
-
-        notice $ pretty l <+> pretty (HS.size n) <+> "done in " <+> pretty (realToFrac @_ @(Fixed E3) t3)
-        notice $ "avg per commit" <+> pretty a
+          atomically (STM.flushTQueue new_) >>= liftIO . print . pretty . length
+
+      _ -> throwIO (BadFormException @C nil)
+
+    entry $ bindMatch "git:list:objects:new" $ nil_ $ \syn -> lift do
+      let (opts,argz) = splitOpts [] syn
+
+      let what = headDef "HEAD" [ x | StringLike x <- argz ]
+      h0 <- gitRevParseThrow what
+      no_ <- newTVarIO 0
+
+      void $ flip runContT pure do
+
+        -- cache <- newTVarIO ( mempty :: HashSet GitHash )
+        -- read absolutely everything from the index into memory and build a HashSet;
+        -- so there is no benefit at all to this being an index,
+        -- it is a full scan in any case.
+        -- The index is a sorted sequence of [(GitHash, HashRef)]
+        -- as a bytestring in "SD" format, D ~ GitHash <> HashRef
+        -- let blm = runST (MBloom.new undefined 1000000)
+        --
+        -- bloom <- liftIO $ stToIO $ MBloom.new 10000
+        --
+        index <- lift openIndex
+
+        -- let req h = do
+        --       atomically do
+        --         readTVar cache <&> not . HS.member h
+
+        -- let
+        --   req2 :: GitHash -> Git3 m Bool
+        --   req2 h = liftIO do
+        --     here <- liftIO $ stToIO $ MBloom.elem h bloom
+        --     if not here then pure True else do
+        --       atomically $ modifyTVar blmn_ succ
+        --       forConcurrently_ files $ \f -> do
+        --         found <- binarySearchBS 56 ( BS.take 20. BS.drop 4 ) (coerce h) f
+        --         when (isJust found) do
+        --           atomically $ modifyTVar excl_ (HS.insert h)
+        --       readTVarIO excl_ <&> not . HS.member h
+
+        --   req3 :: HashSet GitHash -> Git3 m (HashSet GitHash)
+        --   req3 hs = liftIO do
+        --     forConcurrently_ files $ \f -> do
+        --       flip fix (HS.toList hs) $ \next -> \case
+        --         [] -> none
+        --         (x:xs) -> do
+        --           already <- readTVarIO excl_ <&> HS.member x
+        --           inBloom <- liftIO $ stToIO $ MBloom.elem x bloom
+        --           when inBloom do
+        --             atomically $ modifyTVar blmn_ succ
+        --           when (not already || inBloom) do
+        --             found <- binarySearchBS 56 ( BS.take 20. BS.drop 4 ) (coerce x) f
+        --             when (isJust found) do
+        --               atomically $ modifyTVar excl_ (HS.insert x)
+        --           next xs
+        --     found <- readTVarIO excl_
+        --     pure ( hs `HS.difference` found)
+
+        -- read only the commits that are not in the index;
+        -- very fast, since there are relatively few objects
+        idx <- lift openIndex
+        let req h = lift $ indexEntryLookup idx h <&> isNothing
+
+        (t1,r) <- timeItT (lift $ readCommitChainHPSQ req Nothing h0 dontHandle)
+
+        let s = HPSQ.size r
+
+        notice $ pretty s <+> "new commits read at" <+> pretty (realToFrac @_ @(Fixed E3) t1)
+
+        cap <- liftIO getNumCapabilities
+        gitCatBatchQ <- contWorkerPool cap do
+          che <- ContT withGitCat
+          pure $ gitReadObjectMaybe che
+
+        uniq_ <- newTVarIO mempty
+        -- c1 <- newCacheFixedHPSQ 1000
+
+        (t3, _) <- timeItT $ lift $ forConcurrently_ (HPSQ.toList r) $ \(commit,_,_) -> do
+          (_,self) <- gitCatBatchQ commit
+                        >>= orThrow (GitReadError (show $ pretty commit))
+
+          tree <- gitReadCommitTree self
+
+          -- read only the objects that are not in the index
+          gitReadTreeObjectsOnly commit
+            <&> ([commit,tree]<>)
+            >>= \hs -> atomically (for_ hs (modifyTVar uniq_ . HS.insert))
+
+        notice $ "all shit read" <+> pretty (realToFrac @_ @(Fixed E2) t3)
+
+        (t4,new) <- lift $ timeItT $ readTVarIO uniq_ >>= filterM req . HS.toList
+
+        notice $ pretty (length new) <+> "new objects" <+> "at" <+> pretty (realToFrac @_ @(Fixed E2) t4)
+
+        -- x <- readTVarIO uniq_ <&> HS.size
+        -- blmn <- readTVarIO blmn_
+
+        -- notice $ "all shit filter" <+> parens (pretty x) <+> brackets (pretty blmn) <+> pretty (realToFrac @_ @(Fixed E2) t4)
+        -- notice $ pretty (length new)
+
+        -- notice $ "total objects" <+> pretty
+        -- notice $ "present" <+> pretty nhere
        -- liftIO $ print $ pretty (HS
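
Note: both git:commit:list:objects:new and git:list:objects:new above fan their object reads out through contWorkerPool cap, where each of the cap workers holds its own git cat-file session from withGitCat. contWorkerPool is a project helper; purely for orientation, here is a generic, self-contained sketch of that worker-pool shape (illustrative names, no exception handling or shutdown), not the project's implementation:

import Control.Concurrent (forkIO)
import Control.Concurrent.MVar
import Control.Concurrent.STM
import Control.Monad (forever, replicateM_)

-- n workers, each owning one expensive resource created by `acquire`;
-- callers push a request into a shared queue and block on their own MVar.
withWorkerPool :: Int -> IO r -> (r -> a -> IO b) -> IO (a -> IO b)
withWorkerPool n acquire act = do
  q <- newTQueueIO
  replicateM_ n $ forkIO $ do
    r <- acquire                       -- e.g. one git cat-file process
    forever $ do
      (a, answer) <- atomically (readTQueue q)
      act r a >>= putMVar answer       -- serve one request on this worker
  pure $ \a -> do
    answer <- newEmptyMVar
    atomically (writeTQueue q (a, answer))
    takeMVar answer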

View File

@@ -17,9 +17,21 @@ import Data.ByteString.Lazy qualified as LBS
import Data.Maybe
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HM
+import Data.Word
+import Data.Vector (Vector)
+import Data.Vector qualified as V
+import Data.Kind
+import Data.BloomFilter qualified as Bloom
+import Data.BloomFilter (Bloom(..))
+import Data.BloomFilter.Mutable qualified as MBloom
+import Control.Monad.ST
import Control.Concurrent.STM qualified as STM
import Codec.Compression.Zstd.Lazy qualified as ZstdL
+import Codec.Serialise
import Streaming.Prelude qualified as S
import Streaming hiding (run,chunksOf)
@@ -59,6 +71,46 @@ indexPath = do
  reflog <- getGitRemoteKey >>= orThrow Git3ReflogNotSet
  getStatePath (AsBase58 reflog) <&> (</> "index")

+data IndexEntry =
+  IndexEntry
+  { entryFile :: FilePath
+  , entryBS   :: N.ByteString
+  }
+
+data Index a =
+  Index { entries :: [IndexEntry]
+        , bitmap  :: MBloom.MBloom RealWorld GitHash
+        }
+
+openIndex :: forall a m . (Git3Perks m, MonadReader Git3Env m)
+          => m (Index a)
+openIndex = do
+  files <- listObjectIndexFiles
+  bss <- liftIO $ for files $ \(f,_) -> (f,) <$> mmapFileByteString f Nothing
+  let entries = [ IndexEntry f bs | (f,bs) <- bss ]
+  bloom <- liftIO $ stToIO $ MBloom.new bloomHash 10
+  pure $ Index entries bloom
+
+indexEntryLookup :: forall a m . (Git3Perks m)
+                 => Index a
+                 -> GitHash
+                 -> m (Maybe N.ByteString)
+indexEntryLookup Index{..} h = do
+  already_ <- newTVarIO ( mempty :: HashMap GitHash N.ByteString )
+  forConcurrently_ entries $ \IndexEntry{..} -> do
+    what <- readTVarIO already_ <&> HM.lookup h
+    case what of
+      Just{}  -> none
+      Nothing -> do
+        offset' <- binarySearchBS 56 ( BS.take 20 . BS.drop 4 ) (coerce h) entryBS
+        maybe1 offset' none $ \offset -> do
+          let ebs = BS.take 32 $ BS.drop (offset + 4 + 20) entryBS
+          atomically $ modifyTVar already_ (HM.insert h ebs)
+
+  readTVarIO already_ <&> headMay . HM.elems
+
listObjectIndexFiles :: forall m . ( Git3Perks m
                                   , MonadReader Git3Env m
                                   ) => m [(FilePath, Natural)]
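
Note: binarySearchBS 56 ( BS.take 20 . BS.drop 4 ) and BS.drop (offset + 4 + 20) above imply a fixed 56-byte on-disk record: a 4-byte section prefix, the 20-byte git SHA1 used as the sort key, then the 32-byte HashRef of the reflog transaction. scanBS appears to hand consumers the 52-byte body with the prefix already stripped, which is why enumEntries callers do BS.take 20 directly. A standalone decoding sketch of that assumed layout, with illustrative names:

import Data.ByteString (ByteString)
import Data.ByteString qualified as BS

-- One raw 56-byte index record, as implied by indexEntryLookup above.
data RawIndexRecord = RawIndexRecord
  { recSha1 :: ByteString   -- 20 bytes: git object hash (sort key)
  , recTx   :: ByteString   -- 32 bytes: HashRef of the reflog tx
  }

decodeIndexRecord :: ByteString -> Maybe RawIndexRecord
decodeIndexRecord rec
  | BS.length rec < 56 = Nothing
  | otherwise =
      let body         = BS.drop 4 rec          -- skip the 4-byte section prefix
          (sha1, rest) = BS.splitAt 20 body     -- key: 20-byte SHA1
      in Just (RawIndexRecord sha1 (BS.take 32 rest))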
@@ -72,16 +124,23 @@ listObjectIndexFiles = do
    pure (f,z)

-enumEntries :: forall m . ( Git3Perks m
-                          , MonadReader Git3Env m
-                          ) => ( BS.ByteString -> m () ) -> m ()
-enumEntries action = do
-  files <- listObjectIndexFiles <&> fmap fst
-  forConcurrently_ files $ \f -> do
-    bs <- liftIO $ mmapFileByteString f Nothing
-    scanBS bs action
+enumEntries :: forall a m . ( Git3Perks m
+                            , MonadReader Git3Env m
+                            ) => Index a -> ( BS.ByteString -> m () ) -> m ()
+enumEntries Index{..} action = do
+  forConcurrently_ entries $ \IndexEntry{..} -> do
+    scanBS entryBS action
+
+bloomHash :: GitHash -> [Word32]
+bloomHash gh = [a,b,c,d,e]
+  where
+    bs = coerce gh
+    a = N.word32 (BS.take 4 bs)
+    b = N.word32 (BS.take 4 $ BS.drop 4 bs)
+    c = N.word32 (BS.take 4 $ BS.drop 8 bs)
+    d = N.word32 (BS.take 4 $ BS.drop 12 bs)
+    e = N.word32 (BS.take 4 $ BS.drop 16 bs)

startReflogIndexQueryQueue :: forall a m . ( Git3Perks m
                                           , MonadReader Git3Env m
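
Note: bloomHash above feeds the bloom filter by slicing the 20-byte git SHA1 into five 32-bit words and using those directly as the k hash values, which is reasonable since digest bytes are already uniformly distributed. N.word32 is a project helper; an equivalent standalone sketch (big-endian decoding is an assumption here):

import Control.Monad (replicateM)
import Data.Binary.Get (runGet, getWord32be)
import Data.ByteString.Lazy qualified as LBS
import Data.Word (Word32)

-- Split a 20-byte digest into five 32-bit words to use as bloom hashes.
digestToBloomHashes :: LBS.ByteString -> [Word32]
digestToBloomHashes = runGet (replicateM 5 getWord32be)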
@@ -134,6 +193,13 @@ startReflogIndexQueryQueue rq = flip runContT pure do
        rest <- readTVar answQ
        for_ rest $ \x -> writeTMVar x Nothing

+bloomFilterSize :: Natural -> Natural -> Double -> Natural
+bloomFilterSize n k p
+  | p <= 0 || p >= 1 = 0
+  | otherwise = rnd $ negate (fromIntegral n * fromIntegral k) / log (1 - p ** (1 / fromIntegral k))
+  where
+    rnd x = 2 ** realToFrac (ceiling (logBase 2 x)) & round
+
writeReflogIndex :: forall m . ( Git3Perks m
                               , MonadReader Git3Env m
                               , HasClientAPI PeerAPI UNIX m
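
Note: bloomFilterSize is the usual bits-for-k-hashes relation m = -n*k / ln (1 - p ** (1/k)), rounded up to the next power of two. A rough worked example with illustrative numbers:

-- n = 1,000,000 indexed objects, k = 5 hashes, p = 0.01 false-positive target:
--   p ** (1/5)                 ≈ 0.398
--   n*k / (-log (1 - 0.398))   ≈ 5.0e6 / 0.508 ≈ 9.85e6 bits
-- which the power-of-two rounding bumps to 2^24:
--
--   bloomFilterSize 1000000 5 0.01   -- ≈ 16777216 bits (~2 MiB)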
@@ -195,4 +261,27 @@ writeReflogIndex = do
        -- notice $ pretty sha1 <+> pretty tx
        writeSection ( LBS.fromChunks [key,value] ) (LBS.hPutStr wh)

+  -- files <- lift listObjectIndexFiles
+  -- let num = sum (fmap snd files) `div` 56
+  -- let size = bloomFilterSize num 5 0.01
+
+  -- bloom <- liftIO $ stToIO (MBloom.new bloomHash (fromIntegral size))
+
+  -- lift $ enumEntries $ \bs -> do
+  --   liftIO $ stToIO $ MBloom.insert bloom (coerce bs)
+
+  -- let bloomIdxName = idxPath </> "filter"
+
+  -- bytes <- liftIO $ stToIO $ Bloom.freeze bloom
+
+  -- liftIO $ UIO.withBinaryFileAtomic bloomIdxName WriteMode $ \wh -> do
+  --   LBS.hPutStr wh "puk"
+  --   LBS.hPutStr wh (serialise bytes)
+
+  -- LBS.writeFile (serialise b
+  -- for_ ss $ \sha1 -> do
+  --   let key = coerce @_ @N.ByteString sha1
+  --   let value = coerce @_ @N.ByteString tx
+  --   -- notice $ pretty sha1 <+> pretty tx
+  --   writeSection ( LBS.fromChunks [key,value] ) (LBS.hPutStr wh)
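
Note: the commented-out draft above hints where this is heading: size the filter from the index files (56 bytes per record), insert every indexed GitHash via bloomHash, then freeze and persist it next to the index. How the pieces already defined in this module might compose, as a sketch only (assumes the surrounding module's imports; the insert key and persistence format are still open in the draft, so this is not the committed behaviour):

-- Build an in-memory bloom filter over every indexed git hash.
buildIndexBloom :: (Git3Perks m, MonadReader Git3Env m)
                => m (MBloom.MBloom RealWorld GitHash)
buildIndexBloom = do
  files <- listObjectIndexFiles
  let num  = sum (fmap snd files) `div` 56      -- 56 bytes per index record
  let size = bloomFilterSize num 5 0.01         -- 5 hashes, 1% false positives
  bloom <- liftIO $ stToIO $ MBloom.new bloomHash (fromIntegral size)
  idx <- openIndex
  enumEntries idx $ \bs ->
    -- the first 20 bytes of each entry are the git SHA1
    liftIO $ stToIO $ MBloom.insert bloom (GitHash (coerce (BS.take 20 bs)))
  pure bloom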