From 8a5ffe2e7bae2a2e6f7a5e41d8d6298b7c8c149a Mon Sep 17 00:00:00 2001 From: voidlizard Date: Sat, 11 Jan 2025 15:26:38 +0300 Subject: [PATCH] compact-index --- hbs2-git3/app/Main.hs | 60 +++--------------------- hbs2-git3/lib/HBS2/Git3/Prelude.hs | 1 - hbs2-git3/lib/HBS2/Git3/State/Index.hs | 64 ++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 54 deletions(-) diff --git a/hbs2-git3/app/Main.hs b/hbs2-git3/app/Main.hs index 2b45d0d8..b4b39268 100644 --- a/hbs2-git3/app/Main.hs +++ b/hbs2-git3/app/Main.hs @@ -68,9 +68,6 @@ import Data.Generics.Labels import Data.Generics.Product import Lens.Micro.Platform -import Data.Heap (Entry(..)) -import Data.Heap qualified as Heap - import Streaming.Prelude qualified as S import System.Exit qualified as Q @@ -400,46 +397,6 @@ mergeSortedFiles getKey file1 file2 outFile = do | otherwise = write y >> mergeEntries (x:xs) ys extractKey write -mergeSortedFilesN :: forall m . MonadUnliftIO m - => (N.ByteString -> N.ByteString) -- ^ Функция извлечения ключа - -> [FilePath] -- ^ Входные файлы - -> FilePath -- ^ Выходной файл - -> m () - -mergeSortedFilesN _ [] out = rm out - -mergeSortedFilesN _ [_] out = rm out - -mergeSortedFilesN getKey inputFiles outFile = do - - mmaped <- for inputFiles $ \fn -> do - liftIO (mmapFileByteString fn Nothing) - - liftIO $ UIO.withBinaryFileAtomic outFile WriteMode $ \hOut -> do - flip fix (mmaped, Heap.empty) $ \next (mmf, win) -> do - let (entries, files) = fmap readEntry mmf & unzip - let values = [ Entry (getKey e) e | e <- catMaybes entries ] - let e' = (win <> Heap.fromList values) & Heap.uncons - maybe1 e' none $ \(Entry _ e, newWin) -> do - liftIO $ writeSection (LBS.fromStrict e) (LBS.hPutStr hOut) - next (catMaybes files, newWin) - - mapM_ rm inputFiles - - where - readEntry :: BS.ByteString -> (Maybe BS.ByteString, Maybe BS.ByteString) - - readEntry src | BS.length src < 4 = (mzero, mzero) - - readEntry src = do - let (size, rest) = BS.splitAt 4 src & over _1 ( fromIntegral . N.word32 ) - let (e, rest2) = BS.splitAt size rest - - if BS.length e < size then - (mzero, mzero) - else - (Just e, Just rest2) - theDict :: forall m . ( HBS2GitPerks m -- , HasClientAPI PeerAPI UNIX m @@ -476,6 +433,8 @@ theDict = do _ -> throwIO (BadFormException @C nil) + entry $ bindValue "index-block-size" (mkInt $ 32 * 1024 * 1024) + entry $ bindMatch "git:tree:ls" $ nil_ $ const do r <- gitReadTree "HEAD" for_ r $ \GitTreeEntry{..} -> do @@ -1027,17 +986,12 @@ theDict = do on conflict (sha1) do update set tx = excluded.tx|] (p,h) - entry $ bindMatch "reflog:index:compact" $ nil_ $ \syn -> lift do - reflog <- getGitRemoteKey >>= orThrowUser "reflog not set" - idxPath <- getStatePath (AsBase58 reflog) <&> ( "index") - mkdir idxPath + entry $ bindMatch "reflog:index:compact" $ nil_ $ \_ -> do + size <- lookupValue "index-block-size" >>= \case + LitIntVal n -> pure (fromIntegral n) + _ -> pure 33554432 - files <- dirFiles idxPath - <&> filter ((== ".idx") . takeExtension) - - out <- liftIO $ emptyTempFile idxPath "objects-.idx" - - mergeSortedFilesN (BS.take 20) files out + lift $ compactIndex size entry $ bindMatch "reflog:index:path" $ nil_ $ const $ lift do indexPath >>= liftIO . print . pretty diff --git a/hbs2-git3/lib/HBS2/Git3/Prelude.hs b/hbs2-git3/lib/HBS2/Git3/Prelude.hs index f6ca25d4..814df0b0 100644 --- a/hbs2-git3/lib/HBS2/Git3/Prelude.hs +++ b/hbs2-git3/lib/HBS2/Git3/Prelude.hs @@ -106,7 +106,6 @@ class HasExportOpts m where getCompressionLevel :: m Int setCompressionLevel :: Int -> m () - class HasGitRemoteKey m where getGitRemoteKey :: m (Maybe GitRemoteKey) setGitRemoteKey :: GitRemoteKey -> m () diff --git a/hbs2-git3/lib/HBS2/Git3/State/Index.hs b/hbs2-git3/lib/HBS2/Git3/State/Index.hs index 42372288..e65b541f 100644 --- a/hbs2-git3/lib/HBS2/Git3/State/Index.hs +++ b/hbs2-git3/lib/HBS2/Git3/State/Index.hs @@ -13,6 +13,8 @@ import Data.ByteString.Lazy qualified as LBS import Data.List qualified as L import Network.ByteOrder qualified as N import System.IO.Temp as Temp +import Data.Heap (Entry(..)) +import Data.Heap qualified as Heap import Data.ByteString.Lazy qualified as LBS import Data.Fixed import Data.Maybe @@ -38,6 +40,7 @@ import Codec.Serialise import Streaming.Prelude qualified as S import Streaming hiding (run,chunksOf) import System.TimeIt +import Lens.Micro.Platform import UnliftIO import UnliftIO.IO.File qualified as UIO @@ -85,6 +88,65 @@ data Index a = Index { entries :: [IndexEntry] } + +mergeSortedFilesN :: forall m . MonadUnliftIO m + => (N.ByteString -> N.ByteString) -- ^ Функция извлечения ключа + -> [FilePath] -- ^ Входные файлы + -> FilePath -- ^ Выходной файл + -> m () + +mergeSortedFilesN _ [] out = rm out + +mergeSortedFilesN _ [_] out = rm out + +mergeSortedFilesN getKey inputFiles outFile = do + + mmaped <- for inputFiles $ \fn -> do + liftIO (mmapFileByteString fn Nothing) + + liftIO $ UIO.withBinaryFileAtomic outFile WriteMode $ \hOut -> do + flip fix (mmaped, Heap.empty) $ \next (mmf, win) -> do + let (entries, files) = fmap readEntry mmf & unzip + let values = [ Entry (getKey e) e | e <- catMaybes entries ] + let e' = (win <> Heap.fromList values) & Heap.uncons + maybe1 e' none $ \(Entry _ e, newWin) -> do + liftIO $ writeSection (LBS.fromStrict e) (LBS.hPutStr hOut) + next (catMaybes files, newWin) + + mapM_ rm inputFiles + + where + readEntry :: BS.ByteString -> (Maybe BS.ByteString, Maybe BS.ByteString) + + readEntry src | BS.length src < 4 = (mzero, mzero) + + readEntry src = do + let (size, rest) = BS.splitAt 4 src & over _1 ( fromIntegral . N.word32 ) + let (e, rest2) = BS.splitAt size rest + + if BS.length e < size then + (mzero, mzero) + else + (Just e, Just rest2) + +compactIndex :: forall m . (Git3Perks m, MonadReader Git3Env m) => Natural -> m () +compactIndex maxSize = do + reflog <- getGitRemoteKey >>= orThrowUser "reflog not set" + idxPath <- getStatePath (AsBase58 reflog) <&> ( "index") + mkdir idxPath + files <- listObjectIndexFiles <&> L.sortOn snd + + let blocks = fix (\next (acc, group, remaining) -> + case remaining of + [] -> [reverse group | not (null group)] + ((file, size) : rest) + | acc + size > maxSize -> reverse group : next (size, [(file, size)], rest) + | otherwise -> next (acc + size, (file, size) : group, rest)) + + forM_ (blocks (0, [], files)) $ \block -> do + out <- liftIO $ emptyTempFile idxPath "objects-.idx" + mergeSortedFilesN (BS.take 20) (map fst block) out + openIndex :: forall a m . (Git3Perks m, MonadReader Git3Env m) => m (Index a) @@ -294,3 +356,5 @@ updateReflogIndex = do -- notice $ pretty sha1 <+> pretty tx writeSection ( LBS.fromChunks [key,value] ) (LBS.hPutStr wh) + lift $ compactIndex ( 32 * 1024 * 1024 ) +