diff --git a/hbs2-git3/app/Main.hs b/hbs2-git3/app/Main.hs index a54cae65..fe0f8b7b 100644 --- a/hbs2-git3/app/Main.hs +++ b/hbs2-git3/app/Main.hs @@ -593,16 +593,9 @@ theDict = do liftIO $ mapM_ setCurrentDirectory mpath - rq <- newTQueueIO - ContT $ withAsync (startReflogIndexQueryQueue rq) + idx <- lift openIndex - let req h = do - let bs = coerce @GitHash @N.ByteString h - let tr = const True - w <- newEmptyTMVarIO - atomically $ writeTQueue rq (bs, tr, w) - r <- atomically $ readTMVar w - pure $ isNothing r + let req h = lift $ indexEntryLookup idx h <&> isNothing -- let hss = headDef "HEAD" [ x | StringLike x <- snd (splitOpts [] syn) ] h <- gitRevParseThrow hss @@ -611,7 +604,6 @@ theDict = do for_ (HPSQ.toList r) $ \(k,_,_) -> do liftIO $ print $ pretty k - entry $ bindMatch "test:git:log:cat" $ nil_ $ \syn -> lift do let (opts, argz) = splitOpts [("--git",0),("--packed",0),("--import",1)] syn @@ -754,23 +746,18 @@ theDict = do found <- liftIO $ binarySearchBS 56 (BS.take 20 . BS.drop 4) (coerce h) file liftIO $ notice $ pretty h <+> pretty (isJust found) - entry $ bindMatch "reflog:index:search" $ nil_ $ \syn -> lift $ flip runContT pure do + entry $ bindMatch "reflog:index:search" $ nil_ $ \syn -> lift do let (_, argz) = splitOpts [] syn hash <- headMay [ x | GitHashLike x <- argz ] & orThrowUser "need sha1" - rq <- newTQueueIO + idx <- openIndex - ContT $ withAsync $ startReflogIndexQueryQueue rq + answ <- indexEntryLookup idx hash - answ_ <- newEmptyTMVarIO - - atomically $ writeTQueue rq (coerce hash, const True, answ_) - - answ <- atomically $ readTMVar answ_ - - for_ answ $ \a -> do + for_ answ $ \bs -> do + let a = coerce (BS.take 32 bs) :: HashRef liftIO $ print $ pretty a entry $ bindMatch "test:git:log:index:flat:search:linear:test" $ nil_ $ \case @@ -1095,7 +1082,7 @@ theDict = do idx <- openIndex - let req h = lift $ indexEntryLookup idx h <&> isNothing + -- let req h = lift $ indexEntryLookup idx h <&> isNothing flip runContT pure do cap <- liftIO getNumCapabilities @@ -1114,7 +1101,7 @@ theDict = do -- читаем только те объекты, которые не в индексе hashes <- gitReadTreeObjectsOnly commit <&> ([commit,tree]<>) - >>= filterM req + >>= lift . indexFilterNewObjects idx . HS.fromList -- atomically $ mapM_ (writeTQueue new_) hashes atomically (STM.flushTQueue new_) >>= liftIO . print . pretty . length @@ -1131,63 +1118,6 @@ theDict = do void $ flip runContT pure do - -- cache <- newTVarIO ( mempty :: HashSet GitHash ) - - -- читаем вообще всё из индекса в память и строим HashSet - -- получается, что вообще никакого профита, что это индекс, - -- это фуллскан в любом случае. - -- Индекс это сортированная последовательность [(GitHash, HashRef)] - -- в виде байстроки формата "SD", D ~ GitHash <> HashRef - - -- let blm = runST (MBloom.new undefined 1000000) - -- - -- bloom <- liftIO $ stToIO $ MBloom.new 10000 - -- - - index <- lift openIndex - - -- let req h = do - -- atomically do - -- readTVar cache <&> not . HS.member h - - -- let - -- req2 :: GitHash -> Git3 m Bool - -- req2 h = liftIO do - -- here <- liftIO $ stToIO $ MBloom.elem h bloom - - -- if not here then pure True else do - -- atomically $ modifyTVar blmn_ succ - -- forConcurrently_ files $ \f -> do - -- found <- binarySearchBS 56 ( BS.take 20. BS.drop 4 ) (coerce h) f - -- when (isJust found) do - -- atomically $ modifyTVar excl_ (HS.insert h) - -- readTVarIO excl_ <&> not . HS.member h - - -- req3 :: HashSet GitHash -> Git3 m (HashSet GitHash) - -- req3 hs = liftIO do - - -- forConcurrently_ files $ \f -> do - -- flip fix (HS.toList hs) $ \next -> \case - -- [] -> none - -- (x:xs) -> do - -- already <- readTVarIO excl_ <&> HS.member x - -- inBloom <- liftIO $ stToIO $ MBloom.elem x bloom - - -- when inBloom do - -- atomically $ modifyTVar blmn_ succ - - -- when (not already || inBloom) do - -- found <- binarySearchBS 56 ( BS.take 20. BS.drop 4 ) (coerce x) f - -- when (isJust found) do - -- atomically $ modifyTVar excl_ (HS.insert x) - -- next xs - - -- found <- readTVarIO excl_ - -- pure ( hs `HS.difference` found) - - -- читаем только те коммиты, которые не в индексе - -- очень быстро, пушо относительно мало объектов - idx <- lift openIndex let req h = lift $ indexEntryLookup idx h <&> isNothing @@ -1217,7 +1147,7 @@ theDict = do notice $ "all shit read" <+> pretty (realToFrac @_ @(Fixed E2) t3) - (t4,new) <- lift $ timeItT $ readTVarIO uniq_ >>= filterM req . HS.toList + (t4,new) <- lift $ timeItT $ readTVarIO uniq_ >>= indexFilterNewObjects idx notice $ pretty (length new) <+> "new objects" <+> "at" <+> pretty (realToFrac @_ @(Fixed E2) t4) diff --git a/hbs2-git3/lib/HBS2/Git3/State/Index.hs b/hbs2-git3/lib/HBS2/Git3/State/Index.hs index 8e7fdfb0..292b2315 100644 --- a/hbs2-git3/lib/HBS2/Git3/State/Index.hs +++ b/hbs2-git3/lib/HBS2/Git3/State/Index.hs @@ -17,6 +17,8 @@ import Data.ByteString.Lazy qualified as LBS import Data.Maybe import Data.HashMap.Strict (HashMap) import Data.HashMap.Strict qualified as HM +import Data.HashSet (HashSet) +import Data.HashSet qualified as HS import Data.Word import Data.Vector (Vector) import Data.Vector qualified as V @@ -79,7 +81,7 @@ data IndexEntry = data Index a = Index { entries :: [IndexEntry] - , bitmap :: MBloom.MBloom RealWorld GitHash + , bitmap :: Bloom GitHash } openIndex :: forall a m . (Git3Perks m, MonadReader Git3Env m) @@ -89,8 +91,20 @@ openIndex = do files <- listObjectIndexFiles bss <- liftIO $ for files $ \(f,_) -> (f,) <$> mmapFileByteString f Nothing let entries = [ IndexEntry f bs | (f,bs) <- bss ] - bloom <- liftIO $ stToIO $ MBloom.new bloomHash 10 - pure $ Index entries bloom + let n = sum (fmap snd files) + let bss = bloomFilterSize n 5 0.01 & fromIntegral + + bloom <- liftIO $ stToIO $ MBloom.new bloomHash bss + + let idx = Index entries undefined + + enumEntries idx $ \bs -> do + let h = coerce (BS.take 20 bs) :: GitHash + liftIO $ stToIO (MBloom.insert bloom h) + + bm <- liftIO $ stToIO $ Bloom.freeze bloom + + pure $ idx { bitmap = bm } indexEntryLookup :: forall a m . (Git3Perks m) => Index a @@ -101,9 +115,11 @@ indexEntryLookup Index{..} h = do already_ <- newTVarIO ( mempty :: HashMap GitHash N.ByteString ) forConcurrently_ entries $ \IndexEntry{..} -> do what <- readTVarIO already_ <&> HM.lookup h - case what of - Just{} -> none - Nothing -> do + let inBloom = Bloom.elem h bitmap + case (inBloom,what) of + (False,_) -> none + (_,Just{}) -> none + (_,Nothing) -> do offset' <- binarySearchBS 56 ( BS.take 20 . BS.drop 4 ) (coerce h) entryBS maybe1 offset' none $ \offset -> do let ebs = BS.take 32 $ BS.drop (offset + 4 + 20) entryBS @@ -111,6 +127,33 @@ indexEntryLookup Index{..} h = do readTVarIO already_ <&> headMay . HM.elems +indexFilterNewObjects :: forall a m . (Git3Perks m) + => Index a + -> HashSet GitHash + -> m [GitHash] + +indexFilterNewObjects Index{..} hashes = do + old_ <- newTVarIO ( mempty :: HashSet GitHash ) + forConcurrently_ entries $ \IndexEntry{..} -> do + flip fix (HS.toList hashes) $ \next -> \case + [] -> none + (x:xs) -> do + let inBloom = Bloom.elem x bitmap + if not inBloom then + next xs + else do + old <- readTVarIO old_ <&> HS.member x + if old then + next xs + else do + off <- binarySearchBS 56 ( BS.take 20 . BS.drop 4 ) (coerce x) entryBS + when (isJust off) do + atomically $ modifyTVar old_ (HS.insert x) + next xs + + old <- readTVarIO old_ + pure $ HS.toList (hashes `HS.difference` old) + listObjectIndexFiles :: forall m . ( Git3Perks m , MonadReader Git3Env m ) => m [(FilePath, Natural)] @@ -142,58 +185,10 @@ bloomHash gh = [a,b,c,d,e] d = N.word32 (BS.take 4 $ BS.drop 12 bs) e = N.word32 (BS.take 4 $ BS.drop 16 bs) -startReflogIndexQueryQueue :: forall a m . ( Git3Perks m - , MonadReader Git3Env m - , HasClientAPI PeerAPI UNIX m - , HasClientAPI RefLogAPI UNIX m - , HasStorage m - ) - => TQueue (BS.ByteString, BS.ByteString -> a, TMVar (Maybe a)) - -> m () - -startReflogIndexQueryQueue rq = flip runContT pure do - files <- lift $ listObjectIndexFiles <&> fmap fst - - -- один файл - не более, чем один поток - -- мапим файлы - -- возвращаем функцию запроса? - -- для каждого файла -- мы создаём отдельную очередь, - -- нам надо искать во всех файлах - - mmaped <- liftIO $ for files (liftIO . flip mmapFileByteString Nothing) - - answQ <- newTVarIO mempty - - forever $ liftIO do - requests <- atomically do - _ <- peekTQueue rq - w <- STM.flushTQueue rq - for_ w $ \(k,_,a) -> do - modifyTVar answQ (HM.insert k a) - pure w - - forConcurrently_ mmaped \bs -> do - for requests $ \(s,f,answ) -> runMaybeT do - - still <- readTVarIO answQ <&> HM.member s - - guard still - - -- FIXME: size-hardcodes - w <- binarySearchBS 56 ( BS.take 20 . BS.drop 4 ) s bs - >>= toMPlus - - let r = f (BS.drop (w * 56) bs) - - atomically do - writeTMVar answ (Just r) - modifyTVar answQ (HM.delete bs) - - atomically do - rest <- readTVar answQ - for_ rest $ \x -> writeTMVar x Nothing - -bloomFilterSize :: Natural -> Natural -> Double -> Natural +bloomFilterSize :: Natural -- ^ elems? + -> Natural -- ^ hash functions + -> Double -- ^ error probability + -> Natural bloomFilterSize n k p | p <= 0 || p >= 1 = 0 | otherwise = rnd $ negate (fromIntegral n * fromIntegral k) / log (1 - p ** (1 / fromIntegral k))