Skip to content

Commit 68961cb

Browse files
authored
Merge pull request #148 from haskell-works/on-the-fly-indexing-support
On the fly indexing support for large files
2 parents f635bd1 + 322cf36 commit 68961cb

File tree

4 files changed

+53
-32
lines changed

4 files changed

+53
-32
lines changed

app/App/Commands/Count.hs

Lines changed: 45 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
{-# LANGUAGE BangPatterns #-}
12
{-# LANGUAGE DataKinds #-}
23
{-# LANGUAGE OverloadedStrings #-}
34
{-# LANGUAGE ScopedTypeVariables #-}
@@ -18,19 +19,23 @@ import HaskellWorks.Data.Json.Query
1819
import HaskellWorks.Data.Json.Standard.Cursor.Generic
1920
import HaskellWorks.Data.MQuery
2021
import HaskellWorks.Data.MQuery.Micro
21-
import HaskellWorks.Data.RankSelect.CsPoppy
22+
import HaskellWorks.Data.RankSelect.CsPoppy1
2223
import HaskellWorks.Data.TreeCursor
2324
import Options.Applicative hiding (columns)
2425

25-
import qualified App.Commands.Types as Z
26-
import qualified Data.ByteString as BS
27-
import qualified Data.ByteString.Internal as BSI
28-
import qualified Data.DList as DL
29-
import qualified Data.Vector.Storable as DVS
30-
import qualified HaskellWorks.Data.BalancedParens.RangeMin as RM
31-
import qualified System.IO.MMap as IO
26+
import qualified App.Commands.Types as Z
27+
import qualified Data.ByteString as BS
28+
import qualified Data.ByteString.Internal as BSI
29+
import qualified Data.DList as DL
30+
import qualified Data.Vector.Storable as DVS
31+
import qualified HaskellWorks.Data.BalancedParens.RangeMin as RM
32+
import qualified HaskellWorks.Data.Json.Standard.Cursor.Fast as JCF
33+
import qualified HaskellWorks.Data.Json.Standard.Cursor.Internal.IbBp as IBBP
34+
import qualified HaskellWorks.Data.RankSelect.CsPoppy.Internal.Alpha1 as A1
35+
import qualified System.IO as IO
36+
import qualified System.IO.MMap as IO
3237

33-
siblings :: GenericCursor BSI.ByteString CsPoppy (RM.RangeMin CsPoppy) -> [GenericCursor BSI.ByteString CsPoppy (RM.RangeMin CsPoppy)]
38+
siblings :: GenericCursor BSI.ByteString CsPoppy1 (RM.RangeMin CsPoppy1) -> [GenericCursor BSI.ByteString CsPoppy1 (RM.RangeMin CsPoppy1)]
3439
siblings c = c:cs
3540
where cs = case nextSibling c of
3641
Just d -> siblings d
@@ -39,24 +44,47 @@ siblings c = c:cs
3944
runCount :: Z.CountOptions -> IO ()
4045
runCount opts = do
4146
let inputFile = opts ^. the @"inputFile"
42-
let ibIndex = opts ^. the @"ibIndex"
43-
let bpIndex = opts ^. the @"bpIndex"
4447
let expression = opts ^. the @"expression"
4548
jsonFr <- IO.mmapFileForeignPtr inputFile IO.ReadOnly Nothing
46-
jsonIbFr <- IO.mmapFileForeignPtr ibIndex IO.ReadOnly Nothing
47-
jsonBpFr <- IO.mmapFileForeignPtr bpIndex IO.ReadOnly Nothing
4849
let jsonBs = fromForeignRegion jsonFr :: BS.ByteString
49-
let jsonIb = fromForeignRegion jsonIbFr :: DVS.Vector Word64
50-
let jsonBp = fromForeignRegion jsonBpFr :: DVS.Vector Word64
5150

52-
let cursor = GenericCursor jsonBs (makeCsPoppy jsonIb) (RM.mkRangeMin (makeCsPoppy jsonBp)) 1
51+
cursor <- case opts ^. the @"indexes" of
52+
Just indexes -> do
53+
let ibIndex = indexes ^. the @"ibIndex"
54+
let bpIndex = indexes ^. the @"bpIndex"
55+
jsonIbFr <- IO.mmapFileForeignPtr ibIndex IO.ReadOnly Nothing
56+
jsonBpFr <- IO.mmapFileForeignPtr bpIndex IO.ReadOnly Nothing
57+
let jsonIb = fromForeignRegion jsonIbFr :: DVS.Vector Word64
58+
let jsonBp = fromForeignRegion jsonBpFr :: DVS.Vector Word64
59+
60+
return $ GenericCursor jsonBs (makeCsPoppy jsonIb) (RM.mkRangeMin (makeCsPoppy jsonBp)) 1
61+
Nothing -> do
62+
IO.putStrLn "Running"
63+
let !ibip = JCF.simdToIbBp jsonBs
64+
let !_ = A1.makeCsPoppyIndex (IBBP.ib ibip)
65+
let !c = JCF.fromBsIbBp jsonBs ibip
66+
IO.putStrLn "Created cursor"
67+
return c
5368

5469
let q = MQuery (DL.fromList $ fmap lightJsonAt (siblings cursor))
5570

5671
putPretty $ q >>= (entry >=> named expression) & count
5772

5873
return ()
5974

75+
optsFileIndex :: Parser Z.FileIndexes
76+
optsFileIndex = Z.FileIndexes
77+
<$> strOption
78+
( long "ib-index"
79+
<> help "IB index"
80+
<> metavar "FILE"
81+
)
82+
<*> strOption
83+
( long "bp-index"
84+
<> help "BP index"
85+
<> metavar "FILE"
86+
)
87+
6088
optsCount :: Parser Z.CountOptions
6189
optsCount = Z.CountOptions
6290
<$> strOption
@@ -65,16 +93,7 @@ optsCount = Z.CountOptions
6593
<> help "Input JSON file"
6694
<> metavar "FILE"
6795
)
68-
<*> strOption
69-
( long "ib-index"
70-
<> help "IB index"
71-
<> metavar "FILE"
72-
)
73-
<*> strOption
74-
( long "bp-index"
75-
<> help "BP index"
76-
<> metavar "FILE"
77-
)
96+
<*> optional optsFileIndex
7897
<*> option auto
7998
( long "expression"
8099
<> help "JSON expression"

app/App/Commands/Types.hs

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ module App.Commands.Types
55
( CountOptions(..)
66
, CreateIndexOptions(..)
77
, DemoOptions(..)
8+
, FileIndexes(..)
89
) where
910

1011
import Data.Text (Text)
@@ -25,7 +26,11 @@ data DemoOptions = DemoOptions
2526

2627
data CountOptions = CountOptions
2728
{ inputFile :: FilePath
28-
, ibIndex :: FilePath
29-
, bpIndex :: FilePath
29+
, indexes :: Maybe FileIndexes
3030
, expression :: Text
3131
} deriving (Eq, Show, Generic)
32+
33+
data FileIndexes = FileIndexes
34+
{ ibIndex :: FilePath
35+
, bpIndex :: FilePath
36+
} deriving (Eq, Show, Generic)

hw-json.cabal

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ common hw-bits { build-depends: hw-bits >= 0.7
5656
common hw-hspec-hedgehog { build-depends: hw-hspec-hedgehog >= 0.1.0.4 && < 0.2 }
5757
common hw-json-simd { build-depends: hw-json-simd >= 0.1.0.2 && < 0.2 }
5858
common hw-json-simple-cursor { build-depends: hw-json-simple-cursor >= 0.1.0.1 && < 0.2 }
59-
common hw-json-standard-cursor { build-depends: hw-json-standard-cursor >= 0.1.1.0 && < 0.2 }
59+
common hw-json-standard-cursor { build-depends: hw-json-standard-cursor >= 0.2.1.0 && < 0.3 }
6060
common hw-mquery { build-depends: hw-mquery >= 0.2.0.0 && < 0.3 }
6161
common hw-parser { build-depends: hw-parser >= 0.1 && < 0.2 }
6262
common hw-prim { build-depends: hw-prim >= 0.6.2.28 && < 0.7 }

src/HaskellWorks/Data/Json/Internal/Slurp.hs

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@ module HaskellWorks.Data.Json.Internal.Slurp
66
, slurpNumber
77
) where
88

9-
import Data.String
109
import Data.Text
1110
import Data.Word
1211
import Data.Word8
@@ -16,8 +15,6 @@ import Prelude hiding (drop)
1615
import qualified Data.Aeson.Parser.Internal as AP
1716
import qualified Data.Attoparsec.ByteString as PBS
1817
import qualified Data.ByteString as BS
19-
import qualified Data.ByteString.Char8 as BSC
20-
import qualified Data.List as L
2118
import qualified Data.Text as T
2219

2320
data JsonState

0 commit comments

Comments
 (0)