Skip to content

IQSS/10697 - Improve batch permission indexing #10698

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/release-notes/10697-improve-permission-indexing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
### Reindexing after a role assignment is less memory intensive

Adding/removing a user from a role on a collection, particularly the root collection, could lead to a significant increase in memory use resulting in Dataverse itself failing with an out-of-memory condition. Such changes now consume much less memory.

If you have experienced out-of-memory failures in Dataverse in the past that could have been caused by this problem, you may wish to run a [reindex in place](https://guides.dataverse.org/en/latest/admin/solr-search-index.html#reindex-in-place) to update any out-of-date information.

For more information, see #10697 and #10698.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
public class SolrIndexServiceBean {

private static final Logger logger = Logger.getLogger(SolrIndexServiceBean.class.getCanonicalName());

@EJB
DvObjectServiceBean dvObjectService;
@EJB
Expand Down Expand Up @@ -149,7 +149,7 @@ private List<DvObjectSolrDoc> constructDatasetSolrDocs(Dataset dataset) {
return solrDocs;
}

// private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile) {
// private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile) {
private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile, Map<Long, List<String>> permStringByDatasetVersion) {
List<DvObjectSolrDoc> datafileSolrDocs = new ArrayList<>();
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataFile.getOwner());
Expand All @@ -166,14 +166,14 @@ private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile, Map<L
cachedPerms = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId());
}
if (cachedPerms != null) {
logger.fine("reusing cached perms for file " + dataFile.getId());
logger.finest("reusing cached perms for file " + dataFile.getId());
perms = cachedPerms;
} else if (datasetVersionFileIsAttachedTo.isReleased()) {
logger.fine("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId());
logger.finest("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId());
perms.add(IndexServiceBean.getPublicGroupString());
} else {
// go to the well (slow)
logger.fine("no cached perms, file is not public, finding perms for file " + dataFile.getId());
logger.finest("no cached perms, file is not public, finding perms for file " + dataFile.getId());
perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo);
}
} else {
Expand Down Expand Up @@ -204,13 +204,14 @@ private List<DvObjectSolrDoc> constructDatafileSolrDocsFromDataset(Dataset datas
} else {
perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo);
}

for (FileMetadata fileMetadata : datasetVersionFileIsAttachedTo.getFileMetadatas()) {
Long fileId = fileMetadata.getDataFile().getId();
String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId;
String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState());
String solrId = solrIdStart + solrIdEnd;
DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms);
logger.fine("adding fileid " + fileId);
logger.finest("adding fileid " + fileId);
datafileSolrDocs.add(dataFileSolrDoc);
}
}
Expand Down Expand Up @@ -361,20 +362,19 @@ private void persistToSolr(Collection<SolrInputDocument> docs) throws SolrServer

public IndexResponse indexPermissionsOnSelfAndChildren(long definitionPointId) {
DvObject definitionPoint = dvObjectService.findDvObject(definitionPointId);
if ( definitionPoint == null ) {
if (definitionPoint == null) {
logger.log(Level.WARNING, "Cannot find a DvOpbject with id of {0}", definitionPointId);
return null;
} else {
return indexPermissionsOnSelfAndChildren(definitionPoint);
}
}

/**
* We use the database to determine direct children since there is no
* inheritance
*/
public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) {
List<DvObject> dvObjectsToReindexPermissionsFor = new ArrayList<>();
List<DataFile> filesToReindexAsBatch = new ArrayList<>();
/**
* @todo Re-indexing the definition point itself seems to be necessary
Expand All @@ -383,27 +383,47 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint)

// We don't create a Solr "primary/content" doc for the root dataverse
// so don't create a Solr "permission" doc either.
int i = 0;
int numObjects = 0;
if (definitionPoint.isInstanceofDataverse()) {
Dataverse selfDataverse = (Dataverse) definitionPoint;
if (!selfDataverse.equals(dataverseService.findRootDataverse())) {
dvObjectsToReindexPermissionsFor.add(definitionPoint);
indexPermissionsForOneDvObject(definitionPoint);
numObjects++;
}
List<Dataset> directChildDatasetsOfDvDefPoint = datasetService.findByOwnerId(selfDataverse.getId());
for (Dataset dataset : directChildDatasetsOfDvDefPoint) {
dvObjectsToReindexPermissionsFor.add(dataset);
indexPermissionsForOneDvObject(dataset);
numObjects++;
for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) {
filesToReindexAsBatch.add(datafile);
i++;
if (i % 100 == 0) {
reindexFilesInBatches(filesToReindexAsBatch);
filesToReindexAsBatch.clear();
}
if (i % 1000 == 0) {
logger.fine("Progress: " +i + " files permissions reindexed");
}
}
logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed");
}
} else if (definitionPoint.isInstanceofDataset()) {
dvObjectsToReindexPermissionsFor.add(definitionPoint);
indexPermissionsForOneDvObject(definitionPoint);
numObjects++;
// index files
Dataset dataset = (Dataset) definitionPoint;
for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) {
filesToReindexAsBatch.add(datafile);
i++;
if (i % 100 == 0) {
reindexFilesInBatches(filesToReindexAsBatch);
filesToReindexAsBatch.clear();
}
}
} else {
dvObjectsToReindexPermissionsFor.add(definitionPoint);
indexPermissionsForOneDvObject(definitionPoint);
numObjects++;
}

/**
Expand All @@ -412,64 +432,64 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint)
* @todo Should update timestamps, probably, even thought these are
* files, see https://github.com/IQSS/dataverse/issues/2421
*/
String response = reindexFilesInBatches(filesToReindexAsBatch);

for (DvObject dvObject : dvObjectsToReindexPermissionsFor) {
/**
* @todo do something with this response
*/
IndexResponse indexResponse = indexPermissionsForOneDvObject(dvObject);
}

reindexFilesInBatches(filesToReindexAsBatch);
logger.fine("Reindexed permissions for " + i + " files and " + numObjects + " datasets/collections");
return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint
+ ": " + dvObjectsToReindexPermissionsFor.size()
);
+ ": " + numObjects);
}

private String reindexFilesInBatches(List<DataFile> filesToReindexPermissionsFor) {
List<SolrInputDocument> docs = new ArrayList<>();
Map<Long, List<Long>> byParentId = new HashMap<>();
Map<Long, List<String>> permStringByDatasetVersion = new HashMap<>();
for (DataFile file : filesToReindexPermissionsFor) {
Dataset dataset = (Dataset) file.getOwner();
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) {
boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState());
if (cardShouldExist) {
List<String> cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId());
if (cachedPermission == null) {
logger.fine("no cached permission! Looking it up...");
List<DvObjectSolrDoc> fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) {
Long datasetVersionId = fileSolrDoc.getDatasetVersionId();
if (datasetVersionId != null) {
permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions());
int i = 0;
try {
for (DataFile file : filesToReindexPermissionsFor) {
Dataset dataset = (Dataset) file.getOwner();
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) {
boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState());
if (cardShouldExist) {
List<String> cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId());
if (cachedPermission == null) {
logger.finest("no cached permission! Looking it up...");
List<DvObjectSolrDoc> fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) {
Long datasetVersionId = fileSolrDoc.getDatasetVersionId();
if (datasetVersionId != null) {
permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions());
SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
docs.add(solrDoc);
i++;
}
}
} else {
logger.finest("cached permission is " + cachedPermission);
List<DvObjectSolrDoc> fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) {
SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
docs.add(solrDoc);
i++;
}
}
} else {
logger.fine("cached permission is " + cachedPermission);
List<DvObjectSolrDoc> fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) {
SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
docs.add(solrDoc);
if (i % 20 == 0) {
persistToSolr(docs);
docs = new ArrayList<>();
}
}
}
Long parent = file.getOwner().getId();
List<Long> existingList = byParentId.get(parent);
if (existingList == null) {
List<Long> empty = new ArrayList<>();
byParentId.put(parent, empty);
} else {
List<Long> updatedList = existingList;
updatedList.add(file.getId());
byParentId.put(parent, updatedList);
}
}
Long parent = file.getOwner().getId();
List<Long> existingList = byParentId.get(parent);
if (existingList == null) {
List<Long> empty = new ArrayList<>();
byParentId.put(parent, empty);
} else {
List<Long> updatedList = existingList;
updatedList.add(file.getId());
byParentId.put(parent, updatedList);
}
}
try {

persistToSolr(docs);
return " " + filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents ";
} catch (SolrServerException | IOException ex) {
Expand Down Expand Up @@ -517,29 +537,26 @@ public JsonObjectBuilder deleteAllFromSolrAndResetIndexTimes() throws SolrServer
}

/**
*
*
* @return A list of dvobject ids that should have their permissions
* re-indexed because Solr was down when a permission was added. The permission
* should be added to Solr. The id of the permission contains the type of
* DvObject and the primary key of the dvObject.
* DvObjects of type DataFile are currently skipped because their index
* time isn't stored in the database, since they are indexed along
* with their parent dataset (this may change).
* re-indexed because Solr was down when a permission was added. The
* permission should be added to Solr. The id of the permission contains the
* type of DvObject and the primary key of the dvObject. DvObjects of type
* DataFile are currently skipped because their index time isn't stored in
* the database, since they are indexed along with their parent dataset
* (this may change).
*/
public List<Long> findPermissionsInDatabaseButStaleInOrMissingFromSolr() {
List<Long> indexingRequired = new ArrayList<>();
long rootDvId = dataverseService.findRootDataverse().getId();
List<Long> missingDataversePermissionIds = dataverseService.findIdStalePermission();
List<Long> missingDatasetPermissionIds = datasetService.findIdStalePermission();
for (Long id : missingDataversePermissionIds) {
for (Long id : missingDataversePermissionIds) {
if (!id.equals(rootDvId)) {
indexingRequired.add(id);
indexingRequired.add(id);
}
}
indexingRequired.addAll(missingDatasetPermissionIds);
return indexingRequired;
}


}
Loading