compact-index

This commit is contained in:
voidlizard 2025-01-11 15:26:38 +03:00
parent 48132864a6
commit 8a5ffe2e7b
3 changed files with 71 additions and 54 deletions

View File

@ -68,9 +68,6 @@ import Data.Generics.Labels
import Data.Generics.Product
import Lens.Micro.Platform
import Data.Heap (Entry(..))
import Data.Heap qualified as Heap
import Streaming.Prelude qualified as S
import System.Exit qualified as Q
@ -400,46 +397,6 @@ mergeSortedFiles getKey file1 file2 outFile = do
| otherwise = write y >> mergeEntries (x:xs) ys extractKey write
mergeSortedFilesN :: forall m . MonadUnliftIO m
=> (N.ByteString -> N.ByteString) -- ^ Функция извлечения ключа
-> [FilePath] -- ^ Входные файлы
-> FilePath -- ^ Выходной файл
-> m ()
mergeSortedFilesN _ [] out = rm out
mergeSortedFilesN _ [_] out = rm out
mergeSortedFilesN getKey inputFiles outFile = do
mmaped <- for inputFiles $ \fn -> do
liftIO (mmapFileByteString fn Nothing)
liftIO $ UIO.withBinaryFileAtomic outFile WriteMode $ \hOut -> do
flip fix (mmaped, Heap.empty) $ \next (mmf, win) -> do
let (entries, files) = fmap readEntry mmf & unzip
let values = [ Entry (getKey e) e | e <- catMaybes entries ]
let e' = (win <> Heap.fromList values) & Heap.uncons
maybe1 e' none $ \(Entry _ e, newWin) -> do
liftIO $ writeSection (LBS.fromStrict e) (LBS.hPutStr hOut)
next (catMaybes files, newWin)
mapM_ rm inputFiles
where
readEntry :: BS.ByteString -> (Maybe BS.ByteString, Maybe BS.ByteString)
readEntry src | BS.length src < 4 = (mzero, mzero)
readEntry src = do
let (size, rest) = BS.splitAt 4 src & over _1 ( fromIntegral . N.word32 )
let (e, rest2) = BS.splitAt size rest
if BS.length e < size then
(mzero, mzero)
else
(Just e, Just rest2)
theDict :: forall m . ( HBS2GitPerks m
-- , HasClientAPI PeerAPI UNIX m
@ -476,6 +433,8 @@ theDict = do
_ -> throwIO (BadFormException @C nil)
entry $ bindValue "index-block-size" (mkInt $ 32 * 1024 * 1024)
entry $ bindMatch "git:tree:ls" $ nil_ $ const do
r <- gitReadTree "HEAD"
for_ r $ \GitTreeEntry{..} -> do
@ -1027,17 +986,12 @@ theDict = do
on conflict (sha1)
do update set tx = excluded.tx|] (p,h)
entry $ bindMatch "reflog:index:compact" $ nil_ $ \syn -> lift do
reflog <- getGitRemoteKey >>= orThrowUser "reflog not set"
idxPath <- getStatePath (AsBase58 reflog) <&> (</> "index")
mkdir idxPath
entry $ bindMatch "reflog:index:compact" $ nil_ $ \_ -> do
size <- lookupValue "index-block-size" >>= \case
LitIntVal n -> pure (fromIntegral n)
_ -> pure 33554432
files <- dirFiles idxPath
<&> filter ((== ".idx") . takeExtension)
out <- liftIO $ emptyTempFile idxPath "objects-.idx"
mergeSortedFilesN (BS.take 20) files out
lift $ compactIndex size
entry $ bindMatch "reflog:index:path" $ nil_ $ const $ lift do
indexPath >>= liftIO . print . pretty

View File

@ -106,7 +106,6 @@ class HasExportOpts m where
getCompressionLevel :: m Int
setCompressionLevel :: Int -> m ()
class HasGitRemoteKey m where
getGitRemoteKey :: m (Maybe GitRemoteKey)
setGitRemoteKey :: GitRemoteKey -> m ()

View File

@ -13,6 +13,8 @@ import Data.ByteString.Lazy qualified as LBS
import Data.List qualified as L
import Network.ByteOrder qualified as N
import System.IO.Temp as Temp
import Data.Heap (Entry(..))
import Data.Heap qualified as Heap
import Data.ByteString.Lazy qualified as LBS
import Data.Fixed
import Data.Maybe
@ -38,6 +40,7 @@ import Codec.Serialise
import Streaming.Prelude qualified as S
import Streaming hiding (run,chunksOf)
import System.TimeIt
import Lens.Micro.Platform
import UnliftIO
import UnliftIO.IO.File qualified as UIO
@ -85,6 +88,65 @@ data Index a =
Index { entries :: [IndexEntry]
}
mergeSortedFilesN :: forall m . MonadUnliftIO m
=> (N.ByteString -> N.ByteString) -- ^ Функция извлечения ключа
-> [FilePath] -- ^ Входные файлы
-> FilePath -- ^ Выходной файл
-> m ()
mergeSortedFilesN _ [] out = rm out
mergeSortedFilesN _ [_] out = rm out
mergeSortedFilesN getKey inputFiles outFile = do
mmaped <- for inputFiles $ \fn -> do
liftIO (mmapFileByteString fn Nothing)
liftIO $ UIO.withBinaryFileAtomic outFile WriteMode $ \hOut -> do
flip fix (mmaped, Heap.empty) $ \next (mmf, win) -> do
let (entries, files) = fmap readEntry mmf & unzip
let values = [ Entry (getKey e) e | e <- catMaybes entries ]
let e' = (win <> Heap.fromList values) & Heap.uncons
maybe1 e' none $ \(Entry _ e, newWin) -> do
liftIO $ writeSection (LBS.fromStrict e) (LBS.hPutStr hOut)
next (catMaybes files, newWin)
mapM_ rm inputFiles
where
readEntry :: BS.ByteString -> (Maybe BS.ByteString, Maybe BS.ByteString)
readEntry src | BS.length src < 4 = (mzero, mzero)
readEntry src = do
let (size, rest) = BS.splitAt 4 src & over _1 ( fromIntegral . N.word32 )
let (e, rest2) = BS.splitAt size rest
if BS.length e < size then
(mzero, mzero)
else
(Just e, Just rest2)
compactIndex :: forall m . (Git3Perks m, MonadReader Git3Env m) => Natural -> m ()
compactIndex maxSize = do
reflog <- getGitRemoteKey >>= orThrowUser "reflog not set"
idxPath <- getStatePath (AsBase58 reflog) <&> (</> "index")
mkdir idxPath
files <- listObjectIndexFiles <&> L.sortOn snd
let blocks = fix (\next (acc, group, remaining) ->
case remaining of
[] -> [reverse group | not (null group)]
((file, size) : rest)
| acc + size > maxSize -> reverse group : next (size, [(file, size)], rest)
| otherwise -> next (acc + size, (file, size) : group, rest))
forM_ (blocks (0, [], files)) $ \block -> do
out <- liftIO $ emptyTempFile idxPath "objects-.idx"
mergeSortedFilesN (BS.take 20) (map fst block) out
openIndex :: forall a m . (Git3Perks m, MonadReader Git3Env m)
=> m (Index a)
@ -294,3 +356,5 @@ updateReflogIndex = do
-- notice $ pretty sha1 <+> pretty tx
writeSection ( LBS.fromChunks [key,value] ) (LBS.hPutStr wh)
lift $ compactIndex ( 32 * 1024 * 1024 )