compact-index

This commit is contained in:
voidlizard 2025-01-11 15:26:38 +03:00
parent 48132864a6
commit 8a5ffe2e7b
3 changed files with 71 additions and 54 deletions

View File

@ -68,9 +68,6 @@ import Data.Generics.Labels
import Data.Generics.Product import Data.Generics.Product
import Lens.Micro.Platform import Lens.Micro.Platform
import Data.Heap (Entry(..))
import Data.Heap qualified as Heap
import Streaming.Prelude qualified as S import Streaming.Prelude qualified as S
import System.Exit qualified as Q import System.Exit qualified as Q
@ -400,46 +397,6 @@ mergeSortedFiles getKey file1 file2 outFile = do
| otherwise = write y >> mergeEntries (x:xs) ys extractKey write | otherwise = write y >> mergeEntries (x:xs) ys extractKey write
mergeSortedFilesN :: forall m . MonadUnliftIO m
=> (N.ByteString -> N.ByteString) -- ^ Функция извлечения ключа
-> [FilePath] -- ^ Входные файлы
-> FilePath -- ^ Выходной файл
-> m ()
mergeSortedFilesN _ [] out = rm out
mergeSortedFilesN _ [_] out = rm out
mergeSortedFilesN getKey inputFiles outFile = do
mmaped <- for inputFiles $ \fn -> do
liftIO (mmapFileByteString fn Nothing)
liftIO $ UIO.withBinaryFileAtomic outFile WriteMode $ \hOut -> do
flip fix (mmaped, Heap.empty) $ \next (mmf, win) -> do
let (entries, files) = fmap readEntry mmf & unzip
let values = [ Entry (getKey e) e | e <- catMaybes entries ]
let e' = (win <> Heap.fromList values) & Heap.uncons
maybe1 e' none $ \(Entry _ e, newWin) -> do
liftIO $ writeSection (LBS.fromStrict e) (LBS.hPutStr hOut)
next (catMaybes files, newWin)
mapM_ rm inputFiles
where
readEntry :: BS.ByteString -> (Maybe BS.ByteString, Maybe BS.ByteString)
readEntry src | BS.length src < 4 = (mzero, mzero)
readEntry src = do
let (size, rest) = BS.splitAt 4 src & over _1 ( fromIntegral . N.word32 )
let (e, rest2) = BS.splitAt size rest
if BS.length e < size then
(mzero, mzero)
else
(Just e, Just rest2)
theDict :: forall m . ( HBS2GitPerks m theDict :: forall m . ( HBS2GitPerks m
-- , HasClientAPI PeerAPI UNIX m -- , HasClientAPI PeerAPI UNIX m
@ -476,6 +433,8 @@ theDict = do
_ -> throwIO (BadFormException @C nil) _ -> throwIO (BadFormException @C nil)
entry $ bindValue "index-block-size" (mkInt $ 32 * 1024 * 1024)
entry $ bindMatch "git:tree:ls" $ nil_ $ const do entry $ bindMatch "git:tree:ls" $ nil_ $ const do
r <- gitReadTree "HEAD" r <- gitReadTree "HEAD"
for_ r $ \GitTreeEntry{..} -> do for_ r $ \GitTreeEntry{..} -> do
@ -1027,17 +986,12 @@ theDict = do
on conflict (sha1) on conflict (sha1)
do update set tx = excluded.tx|] (p,h) do update set tx = excluded.tx|] (p,h)
entry $ bindMatch "reflog:index:compact" $ nil_ $ \syn -> lift do entry $ bindMatch "reflog:index:compact" $ nil_ $ \_ -> do
reflog <- getGitRemoteKey >>= orThrowUser "reflog not set" size <- lookupValue "index-block-size" >>= \case
idxPath <- getStatePath (AsBase58 reflog) <&> (</> "index") LitIntVal n -> pure (fromIntegral n)
mkdir idxPath _ -> pure 33554432
files <- dirFiles idxPath lift $ compactIndex size
<&> filter ((== ".idx") . takeExtension)
out <- liftIO $ emptyTempFile idxPath "objects-.idx"
mergeSortedFilesN (BS.take 20) files out
entry $ bindMatch "reflog:index:path" $ nil_ $ const $ lift do entry $ bindMatch "reflog:index:path" $ nil_ $ const $ lift do
indexPath >>= liftIO . print . pretty indexPath >>= liftIO . print . pretty

View File

@ -106,7 +106,6 @@ class HasExportOpts m where
getCompressionLevel :: m Int getCompressionLevel :: m Int
setCompressionLevel :: Int -> m () setCompressionLevel :: Int -> m ()
class HasGitRemoteKey m where class HasGitRemoteKey m where
getGitRemoteKey :: m (Maybe GitRemoteKey) getGitRemoteKey :: m (Maybe GitRemoteKey)
setGitRemoteKey :: GitRemoteKey -> m () setGitRemoteKey :: GitRemoteKey -> m ()

View File

@ -13,6 +13,8 @@ import Data.ByteString.Lazy qualified as LBS
import Data.List qualified as L import Data.List qualified as L
import Network.ByteOrder qualified as N import Network.ByteOrder qualified as N
import System.IO.Temp as Temp import System.IO.Temp as Temp
import Data.Heap (Entry(..))
import Data.Heap qualified as Heap
import Data.ByteString.Lazy qualified as LBS import Data.ByteString.Lazy qualified as LBS
import Data.Fixed import Data.Fixed
import Data.Maybe import Data.Maybe
@ -38,6 +40,7 @@ import Codec.Serialise
import Streaming.Prelude qualified as S import Streaming.Prelude qualified as S
import Streaming hiding (run,chunksOf) import Streaming hiding (run,chunksOf)
import System.TimeIt import System.TimeIt
import Lens.Micro.Platform
import UnliftIO import UnliftIO
import UnliftIO.IO.File qualified as UIO import UnliftIO.IO.File qualified as UIO
@ -85,6 +88,65 @@ data Index a =
Index { entries :: [IndexEntry] Index { entries :: [IndexEntry]
} }
mergeSortedFilesN :: forall m . MonadUnliftIO m
=> (N.ByteString -> N.ByteString) -- ^ Функция извлечения ключа
-> [FilePath] -- ^ Входные файлы
-> FilePath -- ^ Выходной файл
-> m ()
mergeSortedFilesN _ [] out = rm out
mergeSortedFilesN _ [_] out = rm out
mergeSortedFilesN getKey inputFiles outFile = do
mmaped <- for inputFiles $ \fn -> do
liftIO (mmapFileByteString fn Nothing)
liftIO $ UIO.withBinaryFileAtomic outFile WriteMode $ \hOut -> do
flip fix (mmaped, Heap.empty) $ \next (mmf, win) -> do
let (entries, files) = fmap readEntry mmf & unzip
let values = [ Entry (getKey e) e | e <- catMaybes entries ]
let e' = (win <> Heap.fromList values) & Heap.uncons
maybe1 e' none $ \(Entry _ e, newWin) -> do
liftIO $ writeSection (LBS.fromStrict e) (LBS.hPutStr hOut)
next (catMaybes files, newWin)
mapM_ rm inputFiles
where
readEntry :: BS.ByteString -> (Maybe BS.ByteString, Maybe BS.ByteString)
readEntry src | BS.length src < 4 = (mzero, mzero)
readEntry src = do
let (size, rest) = BS.splitAt 4 src & over _1 ( fromIntegral . N.word32 )
let (e, rest2) = BS.splitAt size rest
if BS.length e < size then
(mzero, mzero)
else
(Just e, Just rest2)
compactIndex :: forall m . (Git3Perks m, MonadReader Git3Env m) => Natural -> m ()
compactIndex maxSize = do
reflog <- getGitRemoteKey >>= orThrowUser "reflog not set"
idxPath <- getStatePath (AsBase58 reflog) <&> (</> "index")
mkdir idxPath
files <- listObjectIndexFiles <&> L.sortOn snd
let blocks = fix (\next (acc, group, remaining) ->
case remaining of
[] -> [reverse group | not (null group)]
((file, size) : rest)
| acc + size > maxSize -> reverse group : next (size, [(file, size)], rest)
| otherwise -> next (acc + size, (file, size) : group, rest))
forM_ (blocks (0, [], files)) $ \block -> do
out <- liftIO $ emptyTempFile idxPath "objects-.idx"
mergeSortedFilesN (BS.take 20) (map fst block) out
openIndex :: forall a m . (Git3Perks m, MonadReader Git3Env m) openIndex :: forall a m . (Git3Perks m, MonadReader Git3Env m)
=> m (Index a) => m (Index a)
@ -294,3 +356,5 @@ updateReflogIndex = do
-- notice $ pretty sha1 <+> pretty tx -- notice $ pretty sha1 <+> pretty tx
writeSection ( LBS.fromChunks [key,value] ) (LBS.hPutStr wh) writeSection ( LBS.fromChunks [key,value] ) (LBS.hPutStr wh)
lift $ compactIndex ( 32 * 1024 * 1024 )