@@ -4,11 +4,12 @@ import ai.chronon.orchestration.RepoIndex._
4
4
import ai .chronon .orchestration .RepoTypes ._
5
5
import ai .chronon .orchestration .utils .CollectionExtensions .IteratorExtensions
6
6
import ai .chronon .orchestration .utils .SequenceMap
7
+ import ai .chronon .orchestration .utils .StringExtensions .StringOps
7
8
import org .apache .logging .log4j .scala .Logging
8
9
9
10
import scala .collection .mutable
10
11
11
- class RepoIndex [T ](proc : ConfProcessor [T ]) extends Logging {
12
+ class RepoIndex [T >: Null ](proc : ConfProcessor [T ]) extends Logging {
12
13
13
14
// first pass updates
14
15
private val branchToFileHash : TriMap [Branch , Name , FileHash ] = mutable.Map .empty
@@ -19,30 +20,14 @@ class RepoIndex[T](proc: ConfProcessor[T]) extends Logging {
19
20
private val versionSequencer : SequenceMap [Name , GlobalHash ] = new SequenceMap [Name , GlobalHash ]
20
21
21
22
def addNodes (fileHashes : mutable.Map [Name , FileHash ],
22
- nodes : Seq [T ],
23
+ newNodes : Seq [T ],
23
24
branch : Branch ,
24
25
dryRun : Boolean = true ): Seq [VersionUpdate ] = {
25
26
26
- val newContents = nodes.map { node =>
27
- val data = proc.toLocalData(node)
28
- val nodeContent = NodeContent (data, node)
29
-
30
- require(data.fileHash == fileHashes(data.name), s " File hash mismatch for ${data.name}" )
31
-
32
- data.name -> (data.fileHash -> nodeContent)
33
-
34
- }.toMap
35
-
36
- def getContents (name : Name , fileHash : FileHash ): NodeContent [T ] = {
37
-
38
- val incomingContents = newContents.get(name).map(_._2)
39
-
40
- lazy val existingContents = fileHashToContent
41
- .get(name)
42
- .flatMap(_.get(fileHash))
43
-
44
- incomingContents.orElse(existingContents).get
45
- }
27
+ val newContents = buildContentMap(proc, newNodes, fileHashes)
28
+ val enrichedFileHashes = newContents.map {
29
+ case (name, content) => name -> content.localData.fileHash
30
+ } ++ fileHashes
46
31
47
32
val globalHashes = mutable.Map .empty[Name , GlobalHash ]
48
33
@@ -51,8 +36,27 @@ class RepoIndex[T](proc: ConfProcessor[T]) extends Logging {
51
36
52
37
if (globalHashes.contains(name)) return globalHashes(name)
53
38
54
- val fileHash = fileHashes(name)
55
- val content = getContents(name, fileHash)
39
+ val fileHash = enrichedFileHashes.get(name) match {
40
+ case Some (hash) => hash
41
+
42
+ // this could be an artifact related to unchanged files on the branch
43
+ // we reach out to content index
44
+ // artifacts are just names with no content - so there should be just one entry
45
+ case None =>
46
+ val hashToContent = fileHashToContent(name)
47
+
48
+ require(hashToContent.nonEmpty, s " Expected 1 entry for artifact $name, found none " )
49
+ require(hashToContent.size == 1 , s " Expected 1 entry for artifact $name, found ${hashToContent.size}" )
50
+
51
+ hashToContent.head._1
52
+ }
53
+
54
+ val content = if (newContents.contains(name)) {
55
+ newContents(name)
56
+ } else {
57
+ // fetch
58
+ fileHashToContent(name)(fileHash)
59
+ }
56
60
57
61
val localHash = content.localData.localHash
58
62
val parents = content.localData.inputs
@@ -70,7 +74,7 @@ class RepoIndex[T](proc: ConfProcessor[T]) extends Logging {
70
74
71
75
logger.info(s " codeString: $codeString" )
72
76
73
- val globalHash = GlobalHash (codeString.hashCode().toHexString )
77
+ val globalHash = GlobalHash (codeString.md5 )
74
78
75
79
globalHashes.update(name, globalHash)
76
80
globalHash
@@ -90,12 +94,12 @@ class RepoIndex[T](proc: ConfProcessor[T]) extends Logging {
90
94
val mainVersions = branchVersionIndex.getOrElse(Branch .main, mutable.Map .empty)
91
95
92
96
val versionUpdates = VersionUpdate .join(newVersions, existingVersions, mainVersions)
93
- VersionUpdate .print(versionUpdates)
94
97
95
98
if (! dryRun) {
96
99
100
+ logger.info(" Not a dry run! Inserting new nodes into the index into branch: " + branch.name)
97
101
newContents.foreach {
98
- case (name, (fileHash, content)) => update(fileHashToContent, name, fileHash, content)
102
+ case (name, content) => update(fileHashToContent, name, content.localData. fileHash, content)
99
103
}
100
104
101
105
val newVersions = globalHashes.map {
@@ -105,7 +109,7 @@ class RepoIndex[T](proc: ConfProcessor[T]) extends Logging {
105
109
name -> version
106
110
}
107
111
108
- branchToFileHash.update(branch, fileHashes )
112
+ branchToFileHash.update(branch, enrichedFileHashes )
109
113
branchVersionIndex.update(branch, newVersions)
110
114
111
115
}
@@ -141,7 +145,7 @@ class RepoIndex[T](proc: ConfProcessor[T]) extends Logging {
141
145
private def pruneContents (): Unit = {
142
146
143
147
// collect unique hashes per name from every branch
144
- val validHashes : mutable.Map [Name , mutable.HashSet [FileHash ]] = innerValues (branchToFileHash)
148
+ val validHashes : mutable.Map [Name , mutable.HashSet [FileHash ]] = innerKeyToValueSet (branchToFileHash)
145
149
146
150
fileHashToContent.retain {
147
151
case (name, fileHashMap) =>
@@ -173,14 +177,12 @@ class RepoIndex[T](proc: ConfProcessor[T]) extends Logging {
173
177
174
178
object RepoIndex {
175
179
176
- private case class NodeContent [T ](localData : LocalData , conf : T )
177
-
178
180
// Two-level mutable map: outer key K1 -> inner key K2 -> value V.
private type TriMap [K1 , K2 , V ] = mutable.Map [K1 , mutable.Map [K2 , V ]]
179
181
180
182
// Stores v under (k1, k2), creating the inner map for k1 first when it does not exist yet.
private def update [K1 , K2 , V ](map : TriMap [K1 , K2 , V ], k1 : K1 , k2 : K2 , v : V ): Unit =
181
183
map.getOrElseUpdate(k1, mutable.Map .empty).update(k2, v)
182
184
183
- private def innerValues [K1 , K2 , V ](map : TriMap [K1 , K2 , V ]): mutable.Map [K2 , mutable.HashSet [V ]] = {
185
+ private def innerKeyToValueSet [K1 , K2 , V ](map : TriMap [K1 , K2 , V ]): mutable.Map [K2 , mutable.HashSet [V ]] = {
184
186
val result = mutable.Map .empty[K2 , mutable.HashSet [V ]]
185
187
map.values.foreach { innerMap =>
186
188
innerMap.foreach {
@@ -191,4 +193,54 @@ object RepoIndex {
191
193
result
192
194
}
193
195
196
+ /**
197
+ * Builds a name -> content map from the nodes handed over by the repo parser.
198
+ * Every content returned by `proc.nodeContents(node)` is stored under its own `localData.name`.
199
+ * Input and output names that are not keys of `fileHashes` are treated as artifacts:
+ * each one gets a synthetic entry built with `LocalData.forArtifact` and a `null` conf
+ * (which is why the type parameter is bounded by `T >: Null`).
+ * The parents of an artifact are the inputs of whatever entry is already stored under its name,
+ * plus the name of the content that lists it as an output, de-duplicated.
+ *
+ * @param proc processor that yields the node contents for each node
+ * @param nodes nodes to index
+ * @param fileHashes names with a known file hash; these are never treated as artifacts
+ * @return mutable map from name to node content, including the synthetic artifact entries
200
+ */
201
+ def buildContentMap [T >: Null ](proc : ConfProcessor [T ],
202
+ nodes : Seq [T ],
203
+ fileHashes : mutable.Map [Name , FileHash ]): mutable.Map [Name , NodeContent [T ]] = {
204
+
205
+ val contentMap = mutable.Map .empty[Name , NodeContent [T ]]
206
+
207
+ // single pass: store each node's own (non-artifact) content, then synthesize entries for its artifact outputs and inputs
208
+ for (
209
+ node <- nodes;
210
+ nodeContent <- proc.nodeContents(node)
211
+ ) {
212
+
213
+ val name = nodeContent.localData.name
214
+ contentMap.update(name, nodeContent)
215
+
216
+ def updateContents (artifactName : Name , isOutput : Boolean ): Unit = {
217
+
218
+ // names with a file hash are not artifacts - nothing to synthesize (the return only leaves updateContents)
219
+ if (fileHashes.contains(artifactName)) return
220
+
221
+ val existingParents = if (contentMap.contains(artifactName)) {
222
+ contentMap(artifactName).localData.inputs
223
+ } else {
224
+ Seq .empty
225
+ }
226
+
227
+ val newParents = if (isOutput) Seq (name) else Seq .empty
228
+
229
+ val parents = (existingParents ++ newParents).distinct
230
+
231
+ val artifactData = LocalData .forArtifact(artifactName, parents)
232
+ val artifactContent = NodeContent [T ](artifactData, null )
233
+
234
+ contentMap.update(artifactName, artifactContent)
235
+
236
+ }
237
+
238
+ nodeContent.localData.outputs.foreach { output => updateContents(output, isOutput = true ) }
239
+ nodeContent.localData.inputs.foreach { input => updateContents(input, isOutput = false ) }
240
+
241
+ }
242
+
243
+ contentMap
244
+ }
245
+
194
246
}
0 commit comments