commit traversal with level, does not suck much

This commit is contained in:
voidlizard 2024-12-22 10:03:37 +03:00
parent b066868965
commit 8804450d7f
1 changed files with 129 additions and 7 deletions

View File

@ -59,6 +59,8 @@ import Data.HashPSQ qualified as HPSQ
import Data.HashPSQ (HashPSQ) import Data.HashPSQ (HashPSQ)
import Data.Maybe import Data.Maybe
import Data.List qualified as L import Data.List qualified as L
import Data.List (sortBy)
import Data.List.Split (chunksOf)
import Data.ByteString.Lazy.Char8 qualified as LBS8 import Data.ByteString.Lazy.Char8 qualified as LBS8
import Data.ByteString.Lazy qualified as LBS import Data.ByteString.Lazy qualified as LBS
import Data.ByteString qualified as BS import Data.ByteString qualified as BS
@ -75,6 +77,8 @@ import Data.HashSet (HashSet(..))
import Data.HashMap.Strict qualified as HM import Data.HashMap.Strict qualified as HM
import Data.HashMap.Strict (HashMap(..)) import Data.HashMap.Strict (HashMap(..))
import Data.Word import Data.Word
import Data.Fixed
import Data.Ord (comparing)
import Data.Generics.Labels import Data.Generics.Labels
import Data.Generics.Product import Data.Generics.Product
import Lens.Micro.Platform import Lens.Micro.Platform
@ -99,9 +103,12 @@ import Data.Either
import Data.Coerce import Data.Coerce
import Data.Kind import Data.Kind
import Data.List (sortOn) import Data.List (sortOn)
import Data.Vector qualified as Vector
import Data.Vector ((!))
import Data.Ord (Down(..)) import Data.Ord (Down(..))
import UnliftIO import UnliftIO
import UnliftIO.Concurrent
import UnliftIO.IO.File qualified as UIO import UnliftIO.IO.File qualified as UIO
{- HLINT ignore "Functor law" -} {- HLINT ignore "Functor law" -}
@ -160,6 +167,15 @@ isGitLsTreeEntry = \case
_ -> Nothing _ -> Nothing
gitReadTreeObjectsOnly :: (Pretty what, MonadIO m) => what -> m [GitHash]
gitReadTreeObjectsOnly what =
gitRunCommand [qc|git ls-tree -t -r --object-only {pretty what}|]
>>= orThrow (GitReadError (show $ pretty what))
<&> fmap LBS8.words . LBS8.lines
<&> mapMaybe \case
[ x ] -> fromStringMay @GitHash (LBS8.unpack x)
_ -> Nothing
gitReadTree :: (Pretty what, MonadIO m) => what -> m [GitTreeEntry] gitReadTree :: (Pretty what, MonadIO m) => what -> m [GitTreeEntry]
gitReadTree what = gitReadTree what =
gitRunCommand [qc|git ls-tree -t -l -r {pretty what}|] gitRunCommand [qc|git ls-tree -t -l -r {pretty what}|]
@ -171,7 +187,6 @@ gitReadTree what =
_ -> Nothing _ -> Nothing
<&> \s -> HM.elems (HM.fromList [ (gitEntryHash e, e) | e <- s]) <&> \s -> HM.elems (HM.fromList [ (gitEntryHash e, e) | e <- s])
class GitObjectReader a where class GitObjectReader a where
gitReadObjectMaybe :: forall m . MonadIO m => a -> GitHash -> m (Maybe (GitObjectType, ByteString)) gitReadObjectMaybe :: forall m . MonadIO m => a -> GitHash -> m (Maybe (GitObjectType, ByteString))
@ -707,6 +722,43 @@ readCommitChain _ h0 action = flip runContT pure $ callCC \_ -> do
debug $ "processed commit" <+> pretty h debug $ "processed commit" <+> pretty h
next $ RCC ( parents <> hs ) (HM.insertWith (<>) h (HS.fromList parents) seen) next $ RCC ( parents <> hs ) (HM.insertWith (<>) h (HS.fromList parents) seen)
data HCC = HCC Int [GitHash] (HashPSQ GitHash Int (HashSet GitHash))
readCommitChainHPSQ :: ( HBS2GitPerks m
, MonadUnliftIO m
, MonadReader Git3Env m
, HasStorage m
, HasStateDB m
)
=> Maybe GitRef
-> GitHash
-> (GitHash -> m ())
-> m (HashPSQ GitHash Int (HashSet GitHash))
readCommitChainHPSQ _ h0 action = flip runContT pure $ callCC \_ -> do
theReader <- ContT $ withGitCat
void $ ContT $ bracket (pure theReader) stopProcess
flip fix (HCC 0 [h0] HPSQ.empty) $ \next -> \case
HCC _ [] seen -> pure seen
HCC n ( h : hs ) seen | HPSQ.member h seen -> next ( HCC n hs seen )
HCC n ( h : hs ) seen -> do
co <- gitReadObjectMaybe theReader h
>>= orThrow(GitReadError $ show $ pretty "object not found" <+> pretty h)
parents <- gitReadCommitParents (Just h) (snd co)
lift $ action h
next $ HCC (n-1) ( parents <> hs ) (snd $ HPSQ.alter (addParents () n parents) h seen )
where
addParents :: a
-> Int
-> [GitHash]
-> Maybe (Int, HashSet GitHash)
-> (a, Maybe (Int, HashSet GitHash))
addParents a n p = \case
Nothing -> (a, Just (n, HS.fromList p))
Just (l,s) -> (a, Just (min l n, s <> HS.fromList p))
export :: ( HBS2GitPerks m export :: ( HBS2GitPerks m
, MonadUnliftIO m , MonadUnliftIO m
, MonadReader Git3Env m , MonadReader Git3Env m
@ -1550,12 +1602,72 @@ theDict = do
export (w <|> re <|> hd) r export (w <|> re <|> hd) r
entry $ bindMatch "test:git:read-commit-chain-full" $ nil_ $ \syn -> lift do entry $ bindMatch "test:git:read-commit-chain-full" $ nil_ $ \syn -> lift do
let (_, argz) = splitOpts [] syn let (opts, argz) = splitOpts [("--threads",1)] syn
xead <- headDef "HEAD" [ x | StringLike x <- argz ] & gitRevParseThrow xead <- headDef "HEAD" [ x | StringLike x <- argz ] & gitRevParseThrow
co <- readCommitChain Nothing xead dontHandle <&> HM.keys let tnum = headDef 1 [fromIntegral x | ListVal [StringLike "--threads", LitIntVal x] <- opts]
for_ co $ \c -> do
t <- gitReadTree c co <- readCommitChain Nothing xead dontHandle
debug $ "entries" <+> pretty c <+> pretty (length t) let total = HM.size co
_n <- newTVarIO 0
t0 <- getTimeCoarse
flip runContT pure do
_fake <- newTVarIO 0
void $ ContT $ withAsync $ flip fix (0,t0) $ \next (x,tt0) -> do
pause @'Seconds 1
t1 <- getTimeCoarse
n <- readTVarIO _n
let dt = 1e-9 * realToFrac (t1 - tt0)
-- when (dt <= 0) $ next (n,t1)
let dn = realToFrac (n-x)
let v = realToFrac $ dn / dt
let est = if v > 0 then realToFrac (total - n) / v :: Fixed E2 else 0
sz <- readTVarIO _fake
debug $ "read"
<+> pretty n <+> "/" <+> pretty total
<+> "elapsed" <+> pretty (ceiling $ realToFrac (t1-t0) * 1e-9 )
<+> "dn" <+> pretty dn
<+> "sz" <+> pretty (realToFrac sz / (1024*1024) :: Fixed E2)
<+> pretty (realToFrac @(Fixed E2) v) <+> "per sec"
<+> "est." <+> pretty est
next (n,t1)
_already <- newTVarIO ( mempty :: HashSet GitHash )
let chunks = chunksOf (total `div` tnum) (HM.keys co)
liftIO $ pooledForConcurrentlyN_ tnum chunks $ \chunk -> flip runContT pure do
theReader <- ContT $ withGitCat
void $ ContT $ bracket none (const $ stopProcess theReader)
for_ chunk $ \commit -> do
hashes <- gitReadTreeObjectsOnly commit
>>= filterM ( \x -> readTVarIO _already <&> not . HS.member x)
atomically $ modifyTVar _n succ
for_ hashes $ \gh -> do
(_t,lbs) <- gitReadObjectMaybe theReader gh
>>= orThrow (GitReadError (show $ pretty gh))
let l = sum $ fmap BS.length (LBS.toChunks lbs)
atomically do
modifyTVar' _fake (+ l)
modifyTVar' _already (HS.insert gh)
n <- readTVarIO _n
liftIO $ print $ pretty "read objects" <+> pretty n <+> "of" <+> pretty total
entry $ bindMatch "test:git:read-commit-chain" $ nil_ $ \syn -> do entry $ bindMatch "test:git:read-commit-chain" $ nil_ $ \syn -> do
(mpath, hss) <- case syn of (mpath, hss) <- case syn of
@ -1570,6 +1682,16 @@ theDict = do
r <- lift $ readCommitChain Nothing h dontHandle r <- lift $ readCommitChain Nothing h dontHandle
liftIO $ print ( HM.size r ) liftIO $ print ( HM.size r )
entry $ bindMatch "test:git:read-commit-chain-dfs" $ nil_ $ \syn -> lift do
let (_, argz) = splitOpts [] syn
let hd = headDef "HEAD" [ x | StringLike x <- argz]
h <- gitRevParseThrow hd
r <- readCommitChainHPSQ Nothing h (\c -> debug $ "commit" <+> pretty c)
<&> HPSQ.toList
<&> sortBy (comparing (view _2))
for_ r $ \(c,_,_) -> do
liftIO $ print $ pretty c
-- debugPrefix :: LoggerEntry -> LoggerEntry -- debugPrefix :: LoggerEntry -> LoggerEntry
debugPrefix = toStderr . logPrefix "[debug] " debugPrefix = toStderr . logPrefix "[debug] "