-
Notifications
You must be signed in to change notification settings - Fork 3.2k
Search suffix tree implementation #48652
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 99 commits
5425082
54a7b60
8622670
01162fe
09e8aa7
fa81e13
e33142f
30424a9
1d11ed2
b502edc
c0e38b8
16e4078
955cdcb
d7c8d88
d07940d
84d3231
eb6b371
6002101
131c303
0a89089
def772b
0806fb0
c660312
24a59b7
146d51b
a2cdaf9
f6dbecf
bda1e34
2508d55
c32d57f
f4812c0
de0b467
c17a95b
0f94c05
fc54faa
96c93e7
81dffdd
25909ba
b85cfd4
5de310c
d51a4c3
9e9b574
4fa8390
8e583cb
56a1e8a
309556a
f7528f4
8b5b77f
a6b7939
8b41e88
4e273df
57af9b1
8b26a1f
af43c93
615a237
2ededdd
02f562d
e894a7f
54dcea8
8bb5b1e
b331193
18de46d
a91ffc9
8277d37
d1d028a
e829066
3b69cf4
8951708
4c4f1dc
050a320
017e9f6
4d5ad3d
1dc2a5e
fdac05f
e8fdd5c
418e2cd
49692fd
1f90a13
ee553da
29a1934
1aec94a
55acdc5
a1103a3
41207c6
736650f
4d115ed
35b2555
6f5be58
34d2dbd
6160173
fabe796
3539a57
fdf63af
fa68c64
162ad83
fd76f43
7c44fa7
29e5288
de7f3b5
f757fd2
59d2562
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,8 @@ import useKeyboardShortcut from '@hooks/useKeyboardShortcut'; | |
import useLocalize from '@hooks/useLocalize'; | ||
import useResponsiveLayout from '@hooks/useResponsiveLayout'; | ||
import useThemeStyles from '@hooks/useThemeStyles'; | ||
import Timing from '@libs/actions/Timing'; | ||
import FastSearch from '@libs/FastSearch'; | ||
import Log from '@libs/Log'; | ||
import * as OptionsListUtils from '@libs/OptionsListUtils'; | ||
import type {OptionData} from '@libs/ReportUtils'; | ||
|
@@ -56,6 +58,49 @@ function SearchRouter() { | |
return OptionsListUtils.getSearchOptions(options, '', betas ?? []); | ||
}, [areOptionsInitialized, betas, options]); | ||
|
||
/** | ||
* Builds a suffix tree and returns a function to search in it. | ||
*/ | ||
const findInSearchTree = useMemo(() => { | ||
const fastSearch = FastSearch.createFastSearch([ | ||
{ | ||
data: searchOptions.personalDetails, | ||
toSearchableString: (option) => { | ||
const displayName = option.participantsList?.[0]?.displayName ?? ''; | ||
return [option.login ?? '', option.login !== displayName ? displayName : ''].join(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does it matter that we will have a string with like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Empty strings won't be searchable: no characters are added to the search tree for them, so they can never be retrieved. |
||
}, | ||
}, | ||
{ | ||
data: searchOptions.recentReports, | ||
toSearchableString: (option) => { | ||
const searchStringForTree = [option.text ?? '', option.login ?? '']; | ||
|
||
if (option.isThread) { | ||
if (option.alternateText) { | ||
searchStringForTree.push(option.alternateText); | ||
} | ||
} else if (!!option.isChatRoom || !!option.isPolicyExpenseChat) { | ||
if (option.subtitle) { | ||
searchStringForTree.push(option.subtitle); | ||
} | ||
} | ||
|
||
return searchStringForTree.join(); | ||
}, | ||
}, | ||
Comment on lines
+71
to
+96
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NAB: considering we're doing the same thing in 2 places, we could move all of this memo into a single function in OptionsListUtils. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 100%, this is actually what I want to do in the next PR! I wouldn't want to change that now, because:
Is that fine with you? 😊 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems fine to me! |
||
]); | ||
function search(searchInput: string) { | ||
const [personalDetails, recentReports] = fastSearch.search(searchInput); | ||
|
||
return { | ||
personalDetails, | ||
recentReports, | ||
}; | ||
} | ||
|
||
return search; | ||
}, [searchOptions.personalDetails, searchOptions.recentReports]); | ||
|
||
const filteredOptions = useMemo(() => { | ||
if (debouncedInputValue.trim() === '') { | ||
return { | ||
|
@@ -65,14 +110,26 @@ function SearchRouter() { | |
}; | ||
} | ||
|
||
const newOptions = OptionsListUtils.filterOptions(searchOptions, debouncedInputValue, {sortByReportTypeInSearch: true, preferChatroomsOverThreads: true}); | ||
Timing.start(CONST.TIMING.SEARCH_FILTER_OPTIONS); | ||
const newOptions = findInSearchTree(debouncedInputValue); | ||
Timing.end(CONST.TIMING.SEARCH_FILTER_OPTIONS); | ||
|
||
return { | ||
const recentReports = newOptions.recentReports.concat(newOptions.personalDetails); | ||
|
||
const userToInvite = OptionsListUtils.pickUserToInvite({ | ||
canInviteUser: true, | ||
recentReports: newOptions.recentReports, | ||
personalDetails: newOptions.personalDetails, | ||
userToInvite: newOptions.userToInvite, | ||
searchValue: debouncedInputValue, | ||
optionsToExclude: [{login: CONST.EMAIL.NOTIFICATIONS}], | ||
}); | ||
|
||
return { | ||
recentReports, | ||
personalDetails: [], | ||
userToInvite, | ||
}; | ||
}, [debouncedInputValue, searchOptions]); | ||
}, [debouncedInputValue, findInSearchTree]); | ||
|
||
const recentReports: OptionData[] = useMemo(() => { | ||
const currentSearchOptions = debouncedInputValue === '' ? searchOptions : filteredOptions; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
/* eslint-disable rulesdir/prefer-at */ | ||
import CONST from '@src/CONST'; | ||
import Timing from './actions/Timing'; | ||
import SuffixUkkonenTree from './SuffixUkkonenTree'; | ||
|
||
// One searchable data set: the items themselves plus a function that turns each item into the text that gets indexed. | ||
// createFastSearch accepts an array of these and returns search results per data set. | ||
type SearchableData<T> = { | ||
/** | ||
* The data that should be searchable | ||
*/ | ||
data: T[]; | ||
/** | ||
* A function that generates a string from a data entry. The string's value is used for searching. | ||
* If you have multiple fields that should be searchable, simply concat them to the string and return it. | ||
* The returned string is lowercased before indexing; an entry for which the string is empty is not indexed at all. | ||
*/ | ||
toSearchableString: (data: T) => string; | ||
}; | ||
|
||
// Certain characters appear very often in our search data (e.g. in email addresses) and we don't need to search for them. | ||
// This set is passed as the `charSetToSkip` option to SuffixUkkonenTree.stringToNumeric, both when indexing data and when converting the search input. | ||
const charSetToSkip = new Set(['@', '.', '#', '$', '%', '&', '*', '+', '-', '/', ':', ';', '<', '=', '>', '?', '_', '~', '!', ' ']); | ||
|
||
/** | ||
* Creates a new "FastSearch" instance. "FastSearch" uses a suffix tree to search for substrings in a list of strings. | ||
* You can provide multiple datasets. The search results will be returned for each dataset. | ||
* | ||
* Note: Creating a FastSearch instance with a lot of data is computationally expensive. You should create an instance once and reuse it. | ||
* Searches will be very fast though, even with a lot of data. | ||
*/ | ||
function createFastSearch<T>(dataSets: Array<SearchableData<T>>) { | ||
Timing.start(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES); | ||
const maxNumericListSize = 400_000; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we exceed this? Or is that pretty unlikely? What happens if we do? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question. I based this number on the most extreme case I have seen so far (the search string generated for tens of thousands of reports and personal details was below 400,000; I don't remember the exact number anymore). Let me brainstorm quickly with Szymon if we can do better! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We think it would be best to implement a resize mechanism. If we hit the limit, we'd resize the arrays to be bigger. However, that comes with a performance overhead. So we should pick a default size that makes sense for most users (400_000 might actually be a bit too much for the average case, but for some extreme cases it might be too little). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Another solution for performance improvements on this could also be to store the array's arraybuffer in mmkv and rehydrate from there [mmkv has first class support for ArrayBuffers], but There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's fine to keep it as 400K for now, but have a follow-up issue to make it resizable. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, thanks! 
I think this is something we can wait and see about - my initial thinking is that we still don't have any pagination for either personal details or reports. And we have also spent a lot of time tweaking things for some very high capacity usage. Though, it was hard for me to tell how many personal details or reports we are talking about here when we'd be hitting that limit. If it's over 10k reports then nobody actually has that AFAIK. |
||
// The user might provide multiple data sets, but internally, the search values will be stored in this one list: | ||
let concatenatedNumericList = new Uint8Array(maxNumericListSize); | ||
// Here we store the index of the data item in the original data list, so we can map the found occurrences back to the original data: | ||
const occurrenceToIndex = new Uint32Array(maxNumericListSize * 4); | ||
// As we are working with ArrayBuffers, we need to keep track of the current offset: | ||
const offset = {value: 1}; | ||
// We store the last offset for a dataSet, so we can map the found occurrences to the correct dataSet: | ||
const listOffsets: number[] = []; | ||
|
||
for (const {data, toSearchableString} of dataSets) { | ||
// Performance critical: the array parameters are passed by reference, so we don't have to create new arrays every time: | ||
dataToNumericRepresentation(concatenatedNumericList, occurrenceToIndex, offset, {data, toSearchableString}); | ||
listOffsets.push(offset.value); | ||
} | ||
concatenatedNumericList[offset.value++] = SuffixUkkonenTree.END_CHAR_CODE; | ||
listOffsets[listOffsets.length - 1] = offset.value; | ||
Timing.end(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES); | ||
|
||
// The list might be larger than necessary, so we clamp it to the actual size: | ||
concatenatedNumericList = concatenatedNumericList.slice(0, offset.value); | ||
|
||
// Create & build the suffix tree: | ||
Timing.start(CONST.TIMING.SEARCH_MAKE_TREE); | ||
const tree = SuffixUkkonenTree.makeTree(concatenatedNumericList); | ||
Timing.end(CONST.TIMING.SEARCH_MAKE_TREE); | ||
|
||
Timing.start(CONST.TIMING.SEARCH_BUILD_TREE); | ||
tree.build(); | ||
Timing.end(CONST.TIMING.SEARCH_BUILD_TREE); | ||
|
||
/** | ||
* Searches for the given input and returns results for each dataset. | ||
* | ||
* @param searchInput - The raw search text. It is lowercased and converted with the same `charSetToSkip` that was used when indexing. | ||
* @returns One array per entry of `dataSets`, in the same order. Each item appears at most once per array because matches are collected in a Set. | ||
* @throws Error when a found occurrence index lies beyond every recorded data set offset, or when the mapped item is falsy. | ||
*/ | ||
function search(searchInput: string): T[][] { | ||
const cleanedSearchString = cleanString(searchInput); | ||
const {numeric} = SuffixUkkonenTree.stringToNumeric(cleanedSearchString, { | ||
charSetToSkip, | ||
// stringToNumeric might return a list that is larger than necessary, so we clamp it to the actual size | ||
// (otherwise the search could fail as we include in our search empty array values): | ||
clamp: true, | ||
}); | ||
const result = tree.findSubstring(Array.from(numeric)); | ||
|
||
// One Set per data set, so an item that matches at several positions is only reported once: | ||
const resultsByDataSet = Array.from({length: dataSets.length}, () => new Set<T>()); | ||
// eslint-disable-next-line @typescript-eslint/prefer-for-of | ||
for (let i = 0; i < result.length; i++) { | ||
const occurrenceIndex = result[i]; | ||
const itemIndexInDataSet = occurrenceToIndex[occurrenceIndex]; | ||
// listOffsets holds the end offset recorded after each data set was written, so the first offset greater than the occurrence identifies its data set: | ||
const dataSetIndex = listOffsets.findIndex((listOffset) => occurrenceIndex < listOffset); | ||
|
||
if (dataSetIndex === -1) { | ||
throw new Error(`[FastSearch] The occurrence index ${occurrenceIndex} is not in any dataset`); | ||
} | ||
const item = dataSets[dataSetIndex].data[itemIndexInDataSet]; | ||
// NOTE(review): this is a truthiness check, so a data set containing falsy items (e.g. 0 or '') would also end up here. | ||
if (!item) { | ||
throw new Error(`[FastSearch] The item with index ${itemIndexInDataSet} in dataset ${dataSetIndex} is not defined`); | ||
} | ||
resultsByDataSet[dataSetIndex].add(item); | ||
} | ||
|
||
return resultsByDataSet.map((set) => Array.from(set)); | ||
} | ||
|
||
return { | ||
search, | ||
}; | ||
} | ||
|
||
/** | ||
* The suffix tree can only store string like values, and internally stores those as numbers. | ||
* This function converts the user data (which are most likely objects) to a numeric representation. | ||
* Additionally a list of the original data and their index position in the numeric list is created, which is used to map the found occurrences back to the original data. | ||
* | ||
* @param concatenatedNumericList - Output buffer shared across all data sets; it is written in place, nothing is returned. | ||
* @param occurrenceToIndex - Output buffer mapping a position in the numeric list to the index of the item within `data`. | ||
* @param offset - Mutable write cursor shared with the caller; it is advanced as values are written. | ||
* @param searchableData - The data set to convert, together with its `toSearchableString` function. | ||
* | ||
* NOTE(review): there is no bounds check against the buffer length in this function. Writes past the end of a typed array are silently ignored by the runtime, so overly large data would be truncated without an error. | ||
*/ | ||
function dataToNumericRepresentation<T>(concatenatedNumericList: Uint8Array, occurrenceToIndex: Uint32Array, offset: {value: number}, {data, toSearchableString}: SearchableData<T>): void { | ||
data.forEach((option, index) => { | ||
const searchStringForTree = toSearchableString(option); | ||
const cleanedSearchStringForTree = cleanString(searchStringForTree); | ||
|
||
// Entries without any searchable text are not added to the list at all: | ||
if (cleanedSearchStringForTree.length === 0) { | ||
return; | ||
} | ||
|
||
// Presumably stringToNumeric writes the converted characters into `outArray` / `outOccurrenceToIndex` and advances `offset` - verify against SuffixUkkonenTree: | ||
SuffixUkkonenTree.stringToNumeric(cleanedSearchStringForTree, { | ||
charSetToSkip, | ||
out: { | ||
outArray: concatenatedNumericList, | ||
offset, | ||
outOccurrenceToIndex: occurrenceToIndex, | ||
index, | ||
}, | ||
}); | ||
// Terminate the entry with a delimiter, and map the delimiter's position to the same item index: | ||
// eslint-disable-next-line no-param-reassign | ||
occurrenceToIndex[offset.value] = index; | ||
// eslint-disable-next-line no-param-reassign | ||
concatenatedNumericList[offset.value++] = SuffixUkkonenTree.DELIMITER_CHAR_CODE; | ||
}); | ||
} | ||
|
||
/** | ||
* Everything in the tree is treated as lowercase. | ||
* Returns the input lowercased via `toLowerCase()`. It is applied both to the strings that get indexed and to the search input, | ||
* so both sides are compared in the same case. | ||
*/ | ||
function cleanString(input: string) { | ||
return input.toLowerCase(); | ||
} | ||
|
||
const FastSearch = { | ||
createFastSearch, | ||
}; | ||
|
||
export default FastSearch; |
Uh oh!
There was an error while loading. Please reload this page.