Skip to content

Commit d039a10

Browse files
qqmyerspdurbin
andauthored
IQSS/10697 - Improve batch permission indexing (#10698)
* reindex batches of 20 files instead of all at once * Also only keep 100 files in list at a time * release note * Just do collections/datasets as you go Avoids keeping everything in memory, also helps in tracking progress as you can see the permissionindextime getting updated per dataset. * fix merge issues, add logging * put comments back to how they were #10697 * reduce logging #10697 * rename release note and add PR number #10697 * fix logging - finest for per file, space in message * adding a space in log message - per review --------- Co-authored-by: Philip Durbin <[email protected]>
1 parent c44ad65 commit d039a10

File tree

2 files changed

+91
-67
lines changed

2 files changed

+91
-67
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
### Reindexing after a role assignment is less memory intensive
2+
3+
Adding/removing a user from a role on a collection, particularly the root collection, could lead to a significant increase in memory use resulting in Dataverse itself failing with an out-of-memory condition. Such changes now consume much less memory.
4+
5+
If you have experienced out-of-memory failures in Dataverse in the past that could have been caused by this problem, you may wish to run a [reindex in place](https://guides.dataverse.org/en/latest/admin/solr-search-index.html#reindex-in-place) to update any out-of-date information.
6+
7+
For more information, see #10697 and #10698.

src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java

+84-67
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
public class SolrIndexServiceBean {
3535

3636
private static final Logger logger = Logger.getLogger(SolrIndexServiceBean.class.getCanonicalName());
37-
37+
3838
@EJB
3939
DvObjectServiceBean dvObjectService;
4040
@EJB
@@ -149,7 +149,7 @@ private List<DvObjectSolrDoc> constructDatasetSolrDocs(Dataset dataset) {
149149
return solrDocs;
150150
}
151151

152-
// private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile) {
152+
// private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile) {
153153
private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile, Map<Long, List<String>> permStringByDatasetVersion) {
154154
List<DvObjectSolrDoc> datafileSolrDocs = new ArrayList<>();
155155
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataFile.getOwner());
@@ -166,14 +166,14 @@ private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile, Map<L
166166
cachedPerms = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId());
167167
}
168168
if (cachedPerms != null) {
169-
logger.fine("reusing cached perms for file " + dataFile.getId());
169+
logger.finest("reusing cached perms for file " + dataFile.getId());
170170
perms = cachedPerms;
171171
} else if (datasetVersionFileIsAttachedTo.isReleased()) {
172-
logger.fine("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId());
172+
logger.finest("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId());
173173
perms.add(IndexServiceBean.getPublicGroupString());
174174
} else {
175175
// go to the well (slow)
176-
logger.fine("no cached perms, file is not public, finding perms for file " + dataFile.getId());
176+
logger.finest("no cached perms, file is not public, finding perms for file " + dataFile.getId());
177177
perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo);
178178
}
179179
} else {
@@ -204,13 +204,14 @@ private List<DvObjectSolrDoc> constructDatafileSolrDocsFromDataset(Dataset datas
204204
} else {
205205
perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo);
206206
}
207+
207208
for (FileMetadata fileMetadata : datasetVersionFileIsAttachedTo.getFileMetadatas()) {
208209
Long fileId = fileMetadata.getDataFile().getId();
209210
String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId;
210211
String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState());
211212
String solrId = solrIdStart + solrIdEnd;
212213
DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms);
213-
logger.fine("adding fileid " + fileId);
214+
logger.finest("adding fileid " + fileId);
214215
datafileSolrDocs.add(dataFileSolrDoc);
215216
}
216217
}
@@ -361,20 +362,19 @@ private void persistToSolr(Collection<SolrInputDocument> docs) throws SolrServer
361362

362363
public IndexResponse indexPermissionsOnSelfAndChildren(long definitionPointId) {
363364
DvObject definitionPoint = dvObjectService.findDvObject(definitionPointId);
364-
if ( definitionPoint == null ) {
365+
if (definitionPoint == null) {
365366
logger.log(Level.WARNING, "Cannot find a DvOpbject with id of {0}", definitionPointId);
366367
return null;
367368
} else {
368369
return indexPermissionsOnSelfAndChildren(definitionPoint);
369370
}
370371
}
371-
372+
372373
/**
373374
* We use the database to determine direct children since there is no
374375
* inheritance
375376
*/
376377
public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) {
377-
List<DvObject> dvObjectsToReindexPermissionsFor = new ArrayList<>();
378378
List<DataFile> filesToReindexAsBatch = new ArrayList<>();
379379
/**
380380
* @todo Re-indexing the definition point itself seems to be necessary
@@ -383,27 +383,47 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint)
383383

384384
// We don't create a Solr "primary/content" doc for the root dataverse
385385
// so don't create a Solr "permission" doc either.
386+
int i = 0;
387+
int numObjects = 0;
386388
if (definitionPoint.isInstanceofDataverse()) {
387389
Dataverse selfDataverse = (Dataverse) definitionPoint;
388390
if (!selfDataverse.equals(dataverseService.findRootDataverse())) {
389-
dvObjectsToReindexPermissionsFor.add(definitionPoint);
391+
indexPermissionsForOneDvObject(definitionPoint);
392+
numObjects++;
390393
}
391394
List<Dataset> directChildDatasetsOfDvDefPoint = datasetService.findByOwnerId(selfDataverse.getId());
392395
for (Dataset dataset : directChildDatasetsOfDvDefPoint) {
393-
dvObjectsToReindexPermissionsFor.add(dataset);
396+
indexPermissionsForOneDvObject(dataset);
397+
numObjects++;
394398
for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) {
395399
filesToReindexAsBatch.add(datafile);
400+
i++;
401+
if (i % 100 == 0) {
402+
reindexFilesInBatches(filesToReindexAsBatch);
403+
filesToReindexAsBatch.clear();
404+
}
405+
if (i % 1000 == 0) {
406+
logger.fine("Progress: " +i + " files permissions reindexed");
407+
}
396408
}
409+
logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed");
397410
}
398411
} else if (definitionPoint.isInstanceofDataset()) {
399-
dvObjectsToReindexPermissionsFor.add(definitionPoint);
412+
indexPermissionsForOneDvObject(definitionPoint);
413+
numObjects++;
400414
// index files
401415
Dataset dataset = (Dataset) definitionPoint;
402416
for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) {
403417
filesToReindexAsBatch.add(datafile);
418+
i++;
419+
if (i % 100 == 0) {
420+
reindexFilesInBatches(filesToReindexAsBatch);
421+
filesToReindexAsBatch.clear();
422+
}
404423
}
405424
} else {
406-
dvObjectsToReindexPermissionsFor.add(definitionPoint);
425+
indexPermissionsForOneDvObject(definitionPoint);
426+
numObjects++;
407427
}
408428

409429
/**
@@ -412,64 +432,64 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint)
412432
* @todo Should update timestamps, probably, even thought these are
413433
* files, see https://github.com/IQSS/dataverse/issues/2421
414434
*/
415-
String response = reindexFilesInBatches(filesToReindexAsBatch);
416-
417-
for (DvObject dvObject : dvObjectsToReindexPermissionsFor) {
418-
/**
419-
* @todo do something with this response
420-
*/
421-
IndexResponse indexResponse = indexPermissionsForOneDvObject(dvObject);
422-
}
423-
435+
reindexFilesInBatches(filesToReindexAsBatch);
436+
logger.fine("Reindexed permissions for " + i + " files and " + numObjects + " datasets/collections");
424437
return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint
425-
+ ": " + dvObjectsToReindexPermissionsFor.size()
426-
);
438+
+ ": " + numObjects);
427439
}
428440

429441
private String reindexFilesInBatches(List<DataFile> filesToReindexPermissionsFor) {
430442
List<SolrInputDocument> docs = new ArrayList<>();
431443
Map<Long, List<Long>> byParentId = new HashMap<>();
432444
Map<Long, List<String>> permStringByDatasetVersion = new HashMap<>();
433-
for (DataFile file : filesToReindexPermissionsFor) {
434-
Dataset dataset = (Dataset) file.getOwner();
435-
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
436-
for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) {
437-
boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState());
438-
if (cardShouldExist) {
439-
List<String> cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId());
440-
if (cachedPermission == null) {
441-
logger.fine("no cached permission! Looking it up...");
442-
List<DvObjectSolrDoc> fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
443-
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) {
444-
Long datasetVersionId = fileSolrDoc.getDatasetVersionId();
445-
if (datasetVersionId != null) {
446-
permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions());
445+
int i = 0;
446+
try {
447+
for (DataFile file : filesToReindexPermissionsFor) {
448+
Dataset dataset = (Dataset) file.getOwner();
449+
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
450+
for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) {
451+
boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState());
452+
if (cardShouldExist) {
453+
List<String> cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId());
454+
if (cachedPermission == null) {
455+
logger.finest("no cached permission! Looking it up...");
456+
List<DvObjectSolrDoc> fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
457+
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) {
458+
Long datasetVersionId = fileSolrDoc.getDatasetVersionId();
459+
if (datasetVersionId != null) {
460+
permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions());
461+
SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
462+
docs.add(solrDoc);
463+
i++;
464+
}
465+
}
466+
} else {
467+
logger.finest("cached permission is " + cachedPermission);
468+
List<DvObjectSolrDoc> fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
469+
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) {
447470
SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
448471
docs.add(solrDoc);
472+
i++;
449473
}
450474
}
451-
} else {
452-
logger.fine("cached permission is " + cachedPermission);
453-
List<DvObjectSolrDoc> fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
454-
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) {
455-
SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
456-
docs.add(solrDoc);
475+
if (i % 20 == 0) {
476+
persistToSolr(docs);
477+
docs = new ArrayList<>();
457478
}
458479
}
459480
}
481+
Long parent = file.getOwner().getId();
482+
List<Long> existingList = byParentId.get(parent);
483+
if (existingList == null) {
484+
List<Long> empty = new ArrayList<>();
485+
byParentId.put(parent, empty);
486+
} else {
487+
List<Long> updatedList = existingList;
488+
updatedList.add(file.getId());
489+
byParentId.put(parent, updatedList);
490+
}
460491
}
461-
Long parent = file.getOwner().getId();
462-
List<Long> existingList = byParentId.get(parent);
463-
if (existingList == null) {
464-
List<Long> empty = new ArrayList<>();
465-
byParentId.put(parent, empty);
466-
} else {
467-
List<Long> updatedList = existingList;
468-
updatedList.add(file.getId());
469-
byParentId.put(parent, updatedList);
470-
}
471-
}
472-
try {
492+
473493
persistToSolr(docs);
474494
return " " + filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents ";
475495
} catch (SolrServerException | IOException ex) {
@@ -517,29 +537,26 @@ public JsonObjectBuilder deleteAllFromSolrAndResetIndexTimes() throws SolrServer
517537
}
518538

519539
/**
520-
*
521-
*
522540
* @return A list of dvobject ids that should have their permissions
523-
* re-indexed because Solr was down when a permission was added. The permission
524-
* should be added to Solr. The id of the permission contains the type of
525-
* DvObject and the primary key of the dvObject.
526-
* DvObjects of type DataFile are currently skipped because their index
527-
* time isn't stored in the database, since they are indexed along
528-
* with their parent dataset (this may change).
541+
* re-indexed because Solr was down when a permission was added. The
542+
* permission should be added to Solr. The id of the permission contains the
543+
* type of DvObject and the primary key of the dvObject. DvObjects of type
544+
* DataFile are currently skipped because their index time isn't stored in
545+
* the database, since they are indexed along with their parent dataset
546+
* (this may change).
529547
*/
530548
public List<Long> findPermissionsInDatabaseButStaleInOrMissingFromSolr() {
531549
List<Long> indexingRequired = new ArrayList<>();
532550
long rootDvId = dataverseService.findRootDataverse().getId();
533551
List<Long> missingDataversePermissionIds = dataverseService.findIdStalePermission();
534552
List<Long> missingDatasetPermissionIds = datasetService.findIdStalePermission();
535-
for (Long id : missingDataversePermissionIds) {
553+
for (Long id : missingDataversePermissionIds) {
536554
if (!id.equals(rootDvId)) {
537-
indexingRequired.add(id);
555+
indexingRequired.add(id);
538556
}
539557
}
540558
indexingRequired.addAll(missingDatasetPermissionIds);
541559
return indexingRequired;
542560
}
543561

544-
545562
}

0 commit comments

Comments
 (0)