Skip to content

Commit da21407

Browse files
authored
Revise perfect hash to align with libgrape-lite's pthash (#1992)
Fixes #1852 Signed-off-by: vegetableysm <[email protected]>
1 parent a9344ae commit da21407

File tree

12 files changed

+247
-1802
lines changed

12 files changed

+247
-1802
lines changed

.gitmodules

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
path = modules/graph/thirdparty/GraphAr
4343
url = https://github.com/alibaba/GraphAr.git
4444
shallow = true
45-
[submodule "modules/graph/thirdparty/libgrape-lite"]
46-
path = modules/graph/thirdparty/libgrape-lite
45+
[submodule "thirdparty/libgrape-lite"]
46+
path = thirdparty/libgrape-lite
4747
url = https://github.com/alibaba/libgrape-lite.git
4848
shallow = true
4949
[submodule "modules/graph/thirdparty/powturbo"]

NOTICE.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,6 @@ This product includes software from the ClickHouse project
5252
* Copyright 2016-2022 ClickHouse, Inc.
5353
* https://github.com/ClickHouse/ClickHouse
5454

55-
This product includes software from the BBHash project
56-
* Copyright (c) 2015 Guillaume Rizk
57-
* https://github.com/rizkg/BBHash
58-
5955
This product includes software from the rax project (BSD, 2-clause)
6056
* Copyright (c) 2017-2019, Salvatore Sanfilippo <antirez at gmail dot com>
6157
* https://github.com/antirez/rax

README.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,6 @@ We thank the following excellent open-source projects:
297297
- `skywalking-infra-e2e <https://github.com/apache/skywalking-infra-e2e>`_ A next-generation End-to-End Testing framework.
298298
- `skywalking-swck <https://github.com/apache/skywalking-swck>`_ A Kubernetes operator for Apache SkyWalking.
299299
- `wyhash <https://github.com/alainesp/wy>`_, C++ wrapper around wyhash and wyrand.
300-
- `BBHash <https://github.com/rizkg/BBHash>`_, a fast, minimal-memory perfect hash function.
301300
- `rax <https://github.com/antirez/rax>`_, an ANSI C radix tree implementation.
302301
- `MurmurHash3 <https://github.com/aappleby/smhasher>`_, a fast non-cryptographic hash function.
303302

modules/basic/CMakeLists.txt

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,34 @@ file(GLOB_RECURSE BASIC_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
4242

4343
add_library(vineyard_basic ${BASIC_SRC_FILES})
4444
target_add_debuginfo(vineyard_basic)
45+
find_package(MPI REQUIRED)
4546
target_link_libraries(vineyard_basic PUBLIC vineyard_client
4647
${ARROW_SHARED_LIB}
4748
${GLOG_LIBRARIES}
49+
${MPI_CXX_LIBRARIES}
4850
)
49-
target_include_directories(vineyard_basic PUBLIC ${ARROW_INCLUDE_DIR})
51+
target_include_directories(vineyard_basic PUBLIC ${ARROW_INCLUDE_DIR} ${MPI_CXX_INCLUDE_PATH})
52+
53+
find_package(libgrapelite 0.3.4 QUIET)
54+
if(LIBGRAPELITE_INCLUDE_DIRS)
55+
message(STATUS "-- Found libgrape-lite: ${LIBGRAPELITE_INCLUDE_DIRS}")
56+
target_include_directories(vineyard_basic PUBLIC ${LIBGRAPELITE_INCLUDE_DIRS})
57+
else()
58+
# use bundled libgrape-lite
59+
message(STATUS "-- Building libgrape-lite from submodule: ${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite")
60+
set(BUILD_LIBGRAPELITE_DOCS OFF CACHE BOOL "no libgrape-lite docs")
61+
set(BUILD_LIBGRAPELITE_TESTS OFF CACHE BOOL "no libgrape-lite tests")
62+
# use `add_subdirectory` so that libgrape-lite is built with the same CMAKE_BUILD_TYPE as vineyard itself, and
63+
# to ensure that libgrapelite-targets-{debug/release}.cmake is generated during installation.
64+
add_subdirectory("${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite"
65+
"${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite"
66+
)
67+
target_include_directories(vineyard_basic PUBLIC
68+
$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite>
69+
$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite/thirdparty>
70+
$<INSTALL_INTERFACE:include>
71+
)
72+
endif()
5073

5174
# install bundled thirdparty: flat_hash_map
5275
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/flat_hash_map
@@ -64,13 +87,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/wyhash
6487
PATTERN "*.hpp" # select C++ template header files
6588
)
6689

67-
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/BBHash
68-
DESTINATION include/vineyard/contrib # target directory
69-
FILES_MATCHING # install only matched files
70-
PATTERN "*.h" # select header files
71-
PATTERN "*.hpp" # select C++ template header files
72-
)
73-
7490
# install bundled thirdparty: cityhash
7591
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cityhash
7692
DESTINATION include/vineyard/contrib # target directory

modules/basic/ds/hashmap.h

Lines changed: 69 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,7 @@ limitations under the License.
2929
#include "client/ds/blob.h"
3030
#include "client/ds/i_object.h"
3131
#include "common/util/arrow.h" // IWYU pragma: keep
32-
33-
#ifdef __GNUC__
34-
#pragma GCC diagnostic push
35-
#pragma GCC diagnostic ignored "-Wunused-variable"
36-
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
37-
#endif
38-
#include "BBHash/BooPHF.h"
39-
#ifdef __GNUC__
40-
#pragma GCC diagnostic pop
41-
#endif
32+
#include "grape/vertex_map/idxers/pthash_idxer.h"
4233

4334
namespace vineyard {
4435

@@ -229,8 +220,6 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
229220
public:
230221
static_assert(std::is_pod<V>::value, "V in perfect hashmap must be POD type");
231222

232-
typedef boomphf::SingleHashFunctor<K> hasher_t;
233-
234223
explicit PerfectHashmapBuilder(Client& client)
235224
: PerfectHashmapBaseBuilder<K, V>(client) {}
236225

@@ -248,12 +237,21 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
248237
const V* values, const size_t n_elements) {
249238
this->set_num_elements_(n_elements);
250239
this->set_ph_keys_(keys);
251-
RETURN_ON_ERROR(detail::boomphf::build_keys(
252-
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements));
240+
for (size_t i = 0; i < n_elements; ++i) {
241+
this->builder_.add((reinterpret_cast<const K*>(keys->data()))[i]);
242+
}
243+
244+
this->builder_.buildPhf();
245+
std::unique_ptr<BlobWriter> writer;
246+
size_t serialize_size = this->builder_.getSerializeSize();
247+
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
248+
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
249+
writer->Seal(client, buf);
250+
253251
return this->allocateValues(
254252
client, n_elements, [&](V* shuffled_values) -> Status {
255-
return detail::boomphf::build_values(
256-
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements,
253+
return detail::perfect_hash::build_values(
254+
idxer_, reinterpret_cast<const K*>(keys->data()), n_elements,
257255
values, shuffled_values);
258256
});
259257
}
@@ -266,11 +264,27 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
266264
const V* values, const size_t n_elements) {
267265
this->set_num_elements_(n_elements);
268266
this->set_ph_keys_(keys);
269-
RETURN_ON_ERROR(detail::boomphf::build_keys(bphf_, keys->GetArray()));
267+
for (auto iter =
268+
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
269+
keys->GetArray()->begin());
270+
iter !=
271+
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
272+
keys->GetArray()->end());
273+
iter++) {
274+
this->builder_.add(*iter);
275+
}
276+
277+
this->builder_.buildPhf();
278+
std::unique_ptr<BlobWriter> writer;
279+
size_t serialize_size = this->builder_.getSerializeSize();
280+
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
281+
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
282+
writer->Seal(client, buf);
283+
270284
return this->allocateValues(
271285
client, n_elements, [&](V* shuffled_values) -> Status {
272-
return detail::boomphf::build_values(bphf_, keys->GetArray(), values,
273-
shuffled_values);
286+
return detail::perfect_hash::build_values(idxer_, keys->GetArray(),
287+
values, shuffled_values);
274288
});
275289
return Status::OK();
276290
}
@@ -289,12 +303,21 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
289303
const V begin_value, const size_t n_elements) {
290304
this->set_num_elements_(n_elements);
291305
this->set_ph_keys_(keys);
292-
RETURN_ON_ERROR(detail::boomphf::build_keys(
293-
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements));
306+
for (size_t i = 0; i < n_elements; ++i) {
307+
this->builder_.add((reinterpret_cast<const K*>(keys->data()))[i]);
308+
}
309+
310+
this->builder_.buildPhf();
311+
std::unique_ptr<BlobWriter> writer;
312+
size_t serialize_size = this->builder_.getSerializeSize();
313+
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
314+
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
315+
writer->Seal(client, buf);
316+
294317
return this->allocateValues(
295318
client, n_elements, [&](V* shuffled_values) -> Status {
296-
return detail::boomphf::build_values(
297-
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements,
319+
return detail::perfect_hash::build_values(
320+
idxer_, reinterpret_cast<const K*>(keys->data()), n_elements,
298321
begin_value, shuffled_values);
299322
});
300323
}
@@ -307,11 +330,27 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
307330
const V begin_value, const size_t n_elements) {
308331
this->set_num_elements_(n_elements);
309332
this->set_ph_keys_(keys);
310-
RETURN_ON_ERROR(detail::boomphf::build_keys(bphf_, keys->GetArray()));
333+
for (auto iter =
334+
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
335+
keys->GetArray()->begin());
336+
iter !=
337+
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
338+
keys->GetArray()->end());
339+
iter++) {
340+
this->builder_.add(*iter);
341+
}
342+
343+
this->builder_.buildPhf();
344+
std::unique_ptr<BlobWriter> writer;
345+
size_t serialize_size = this->builder_.getSerializeSize();
346+
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
347+
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
348+
writer->Seal(client, buf);
349+
311350
return this->allocateValues(
312351
client, n_elements, [&](V* shuffled_values) -> Status {
313-
return detail::boomphf::build_values(bphf_, keys->GetArray(),
314-
begin_value, shuffled_values);
352+
return detail::perfect_hash::build_values(
353+
idxer_, keys->GetArray(), begin_value, shuffled_values);
315354
});
316355
return Status::OK();
317356
}
@@ -323,15 +362,7 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
323362
*
324363
*/
325364
Status Build(Client& client) override {
326-
size_t size = detail::boomphf::bphf_serde::compute_size(bphf_);
327-
std::unique_ptr<BlobWriter> blob_writer;
328-
RETURN_ON_ERROR(client.CreateBlob(size, blob_writer));
329-
char* dst = detail::boomphf::bphf_serde::ser(blob_writer->data(), bphf_);
330-
RETURN_ON_ASSERT(dst == blob_writer->data() + size,
331-
"boomphf serialization error: buffer size mismatched");
332-
std::shared_ptr<Object> blob;
333-
RETURN_ON_ERROR(blob_writer->Seal(client, blob));
334-
this->set_ph_(std::dynamic_pointer_cast<Blob>(blob));
365+
this->set_ph_(buf);
335366
return Status::OK();
336367
}
337368

@@ -359,10 +390,11 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
359390
return Status::OK();
360391
}
361392

362-
boomphf::mphf<K, hasher_t> bphf_;
393+
grape::PTHashIdxerBuilder<K, uint64_t> builder_;
394+
grape::PTHashIdxer<K, uint64_t> idxer_;
395+
std::shared_ptr<Object> buf;
363396

364397
const int concurrency_ = std::thread::hardware_concurrency();
365-
const double gamma_ = 2.5f;
366398
};
367399

368400
} // namespace vineyard

0 commit comments

Comments
 (0)