Skip to content

Commit 8854a1f

Browse files
Implement the Chinese Whispers clustering algorithm in native PHP
To the happiness of many (issues #690, #688, #687, #685, #649, #632, #627, #625, etc.), this means that we no longer depend on the pdlib extension, although it goes without saying that its use is still highly recommended. As you would expect, the native PHP implementation is slower; however, I must admit that with JIT enabled it is quite acceptable, and this is the only reason why I decided to publish it. It is still experimental: it works, but it has problems, such as not seeming to converge to stable clusters. When I can fix this, it will probably be even slower.
1 parent 76fe598 commit 8854a1f

File tree

5 files changed

+186
-25
lines changed

5 files changed

+186
-25
lines changed

appinfo/info.xml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
- **🚀 Build your own thing:** FaceRecognition app is just a basic building block. Through FaceRecognition API, you can build your advanced scenarios - automatically add tags to images, connect contacts and persons, share images from specific person… We want to hear your ideas!
1717
]]>
1818
</description>
19-
<version>0.9.20</version>
19+
<version>0.9.30</version>
2020
<licence>agpl</licence>
2121
<author>Matias De lellis</author>
2222
<author>Branko Kokanovic</author>
@@ -34,9 +34,7 @@
3434
<screenshot>https://matiasdelellis.github.io/img/facerecognition/facerecognition-assign-initial-name.jpeg</screenshot>
3535
<dependencies>
3636
<php min-version="8.0" max-version="8.2" />
37-
<lib>pdlib</lib>
38-
<lib>bz2</lib>
39-
<nextcloud min-version="26" max-version="27"/>
37+
<nextcloud min-version="27" max-version="27"/>
4038
</dependencies>
4139
<repair-steps>
4240
<uninstall>

lib/BackgroundJob/Tasks/CheckRequirementsTask.php

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,10 @@ public function execute(FaceRecognitionContext $context) {
9090
$phpMemory = MemoryLimits::getPhpMemory();
9191
$this->logDebug("PHP Memory Limit: " . ($phpMemory > 0 ? $phpMemory : "Unknown"));
9292

93+
$this->logDebug("Clustering backend: " . (Requirements::pdlibLoaded() ? "pdlib" : "PHP (Not recommended."));
94+
9395
if ($this->imaginaryHelper->isEnabled()) {
94-
$this->logDebug("Backend of images: Imaginary");
96+
$this->logDebug("Image Backend: Imaginary");
9597
$version = $this->imaginaryHelper->getVersion();
9698
if ($version) {
9799
$this->logDebug("Imaginary version: " . $version);
@@ -104,15 +106,7 @@ public function execute(FaceRecognitionContext $context) {
104106
return false;
105107
}
106108
} else {
107-
$this->logDebug("Backend of images: Imagick");
108-
}
109-
110-
if (!Requirements::pdlibLoaded()) {
111-
$error_message =
112-
"The PDlib PHP extension is not loaded. Cannot continue without it." .
113-
"Please read the documentation again about how to install the application: https://github.com/matiasdelellis/facerecognition/wiki/Installation";
114-
$this->logInfo($error_message);
115-
return false;
109+
$this->logDebug("Image Backend: Imagick");
116110
}
117111

118112
if (!Requirements::hasEnoughMemory()) {

lib/BackgroundJob/Tasks/CreateClustersTask.php

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<?php
22
/**
3-
* @copyright Copyright (c) 2017-2020 Matias De lellis <[email protected]>
3+
* @copyright Copyright (c) 2017-2023 Matias De lellis <[email protected]>
44
* @copyright Copyright (c) 2018, Branko Kokanovic <[email protected]>
55
*
66
* @author Branko Kokanovic <[email protected]>
@@ -33,6 +33,9 @@
3333
use OCA\FaceRecognition\Db\PersonMapper;
3434

3535
use OCA\FaceRecognition\Helper\Euclidean;
36+
use OCA\FaceRecognition\Helper\Requirements;
37+
38+
use OCA\FaceRecognition\Clusterer\ChineseWhispers;
3639

3740
use OCA\FaceRecognition\Service\SettingsService;
3841
/**
@@ -282,10 +285,9 @@ private function getNewClusters(array $faces): array {
282285
// Clustering parameters
283286
$sensitivity = $this->settingsService->getSensitivity();
284287

285-
// Create edges for chinese whispers
286-
$edges = array();
287-
288-
if (version_compare(phpversion('pdlib'), '1.0.2', '>=')) {
288+
if (!Requirements::pdlibLoaded()) {
289+
// Create edges (neighbors) for Chinese Whispers
290+
$edges = array();
289291
$faces_count = count($faces);
290292
for ($i = 0; $i < $faces_count; $i++) {
291293
$face1 = $faces[$i];
@@ -304,8 +306,14 @@ private function getNewClusters(array $faces): array {
304306
}
305307
}
306308
}
309+
310+
// Given the edges get the list of labels (found clusters) for each face.
311+
$newChineseClustersByIndex = dlib_chinese_whispers($edges);
307312
} else {
313+
// Create edges (neighbors) for Chinese Whispers
314+
$edges = array();
308315
$faces_count = count($faces);
316+
309317
for ($i = 0; $i < $faces_count; $i++) {
310318
$face1 = $faces[$i];
311319
if (!isset($face1->descriptor)) {
@@ -323,17 +331,27 @@ private function getNewClusters(array $faces): array {
323331
}
324332
}
325333
}
334+
335+
// The clustering algorithm actually expects ordered lists.
336+
$oedges = [];
337+
ChineseWhispers::convert_unordered_to_ordered($edges, $oedges);
338+
usort($oedges, function($a, $b) {
339+
if ($a[0] === $b[0]) return $a[1] - $b[1];
340+
return $a[0] - $b[0];
341+
});
342+
343+
// Given the edges get the list of labels (found clusters) for each face.
344+
$newChineseClustersByIndex = [];
345+
ChineseWhispers::predict($oedges, $newChineseClustersByIndex);
326346
}
327347

328-
$newChineseClustersByIndex = dlib_chinese_whispers($edges);
329348
$newClusters = array();
330349
for ($i = 0, $c = count($newChineseClustersByIndex); $i < $c; $i++) {
331350
if (!isset($newClusters[$newChineseClustersByIndex[$i]])) {
332351
$newClusters[$newChineseClustersByIndex[$i]] = array();
333352
}
334353
$newClusters[$newChineseClustersByIndex[$i]][] = $faces[$i]->id;
335354
}
336-
337355
return $newClusters;
338356
}
339357

lib/Clusterer/ChineseWhispers.php

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
<?php
2+
declare(strict_types=1);
3+
/**
4+
* @copyright Copyright (c) 2023, Matias De lellis
5+
*
6+
* @author Matias De lellis <[email protected]>
7+
*
8+
* @license AGPL-3.0-or-later
9+
*
10+
* This code is free software: you can redistribute it and/or modify
11+
* it under the terms of the GNU Affero General Public License, version 3,
12+
* as published by the Free Software Foundation.
13+
*
14+
* This program is distributed in the hope that it will be useful,
15+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
16+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17+
* GNU Affero General Public License for more details.
18+
*
19+
* You should have received a copy of the GNU Affero General Public License, version 3,
20+
* along with this program. If not, see <http://www.gnu.org/licenses/>
21+
*
22+
*/
23+
24+
namespace OCA\FaceRecognition\Clusterer;
25+
26+
27+
/**
28+
* This class implements the graph clustering algorithm described in the
29+
* paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its
30+
* Application to Natural Language Processing Problems by Chris Biemann.
31+
*
32+
* In particular, it tries to be a shameless copy of the original dlib
33+
* implementation.
34+
* - https://github.com/davisking/dlib/blob/master/dlib/clustering/chinese_whispers.h
35+
*/
36+
class ChineseWhispers {

	/**
	 * Cluster the nodes of a graph, assigning a label to each node from the edges.
	 *
	 * Every node starts with its own label. Then, during
	 * (number of nodes * $num_iterations) rounds, a randomly chosen node adopts
	 * the label that occurs most often among its neighbors. When several labels
	 * are equally frequent, the one seen first while walking the node's edges
	 * wins. A node without edges keeps its own label. Finally the labels are
	 * renumbered into the contiguous range [0, number of clusters).
	 *
	 * The node updated at each round is picked with random_int(), so two runs
	 * over the same edges are not guaranteed to return the same labeling.
	 *
	 * @param array $edges List of [from, to] pairs of non-negative integer node
	 *                     indices. An undirected edge must appear in both
	 *                     directions (see convert_unordered_to_ordered()). All
	 *                     the edges of a node must be contiguous; if the list is
	 *                     not ordered by its first index, a sorted copy is used
	 *                     internally. The caller's array is never modified.
	 * @param array $labels Output. $labels[i] is the cluster assigned to node i,
	 *                      for every i in [0, highest index found in $edges].
	 * @param int $num_iterations Number of update rounds per node.
	 * @return int The number of clusters found (0 when $edges is empty).
	 */
	public static function predict(array &$edges, array &$labels, int $num_iterations = 100): int
	{
		$labels = [];
		if (count($edges) === 0) {
			return 0;
		}

		// find_neighbor_ranges() only works when the edges of each node are
		// contiguous. Unordered input used to produce wrong ranges silently.
		$ordered = self::is_ordered_by_first_index($edges) ? $edges : self::sort_by_index($edges);

		$neighbors = [];
		self::find_neighbor_ranges($ordered, $neighbors);
		$num_nodes = count($neighbors);

		// Initialize the labels, each node gets a different label.
		for ($i = 0; $i < $num_nodes; ++$i) {
			$labels[$i] = $i;
		}

		$rounds = $num_nodes * $num_iterations;
		for ($iter = 0; $iter < $rounds; ++$iter) {
			// Pick a random node.
			$idx = random_int(0, $num_nodes - 1);

			// Count how many times each label happens amongst our neighbors.
			$labels_to_counts = [];
			[$begin, $end] = $neighbors[$idx];
			for ($i = $begin; $i < $end; ++$i) {
				$label = $labels[$ordered[$i][1]];
				$labels_to_counts[$label] = ($labels_to_counts[$label] ?? 0) + 1;
			}

			// Find the most common label. The strict comparison keeps the
			// first label seen when there is a tie.
			$best_score = PHP_INT_MIN;
			$best_label = $labels[$idx];
			foreach ($labels_to_counts as $label => $count) {
				if ($count > $best_score) {
					$best_score = $count;
					$best_label = $label;
				}
			}

			$labels[$idx] = $best_label;
		}

		// Remap the labels into a contiguous range. First we find the
		// mapping, numbering the labels in order of first appearance.
		$label_remap = [];
		for ($i = 0; $i < $num_nodes; ++$i) {
			if (!isset($label_remap[$labels[$i]])) {
				$label_remap[$labels[$i]] = count($label_remap);
			}
		}
		// Now apply the mapping to all the labels.
		for ($i = 0; $i < $num_nodes; ++$i) {
			$labels[$i] = $label_remap[$labels[$i]];
		}

		return count($label_remap);
	}

	/**
	 * Compute, for each node, the range of positions of its edges inside $edges.
	 *
	 * After the call, [$neighbors[i][0], $neighbors[i][1]) is the half-open
	 * range within $edges that contains all the edges whose first index is i.
	 * Nodes that never appear as a first index get the empty range [0, 0].
	 *
	 * @param array $edges List of [from, to] pairs in which all the edges of a
	 *                     node are contiguous (e.g. ordered by first index).
	 * @param array $neighbors Output. Any previous content is discarded. It is
	 *                         left empty when $edges is empty.
	 */
	public static function find_neighbor_ranges(&$edges, &$neighbors): void
	{
		$neighbors = [];
		$num_nodes = self::max_index_plus_one($edges);
		if ($num_nodes === 0) {
			return;
		}
		$neighbors = array_fill(0, $num_nodes, [0, 0]);

		$num_edges = count($edges);
		$cur_node = 0;
		$start_idx = 0;
		for ($i = 0; $i < $num_edges; ++$i) {
			if ($edges[$i][0] !== $cur_node) {
				// A new node starts here, so close the range of the previous one.
				$neighbors[$cur_node] = [$start_idx, $i];
				$start_idx = $i;
				$cur_node = $edges[$i][0];
			}
		}
		// Close the range of the last node.
		$neighbors[$cur_node] = [$start_idx, $num_edges];
	}

	/**
	 * Get the number of nodes implied by a list of index pairs.
	 *
	 * @param array $pairs List of [from, to] pairs of node indices.
	 * @return int The highest index found in any pair plus one, or 0 when
	 *             $pairs is empty.
	 */
	public static function max_index_plus_one($pairs): int
	{
		if (count($pairs) === 0) {
			return 0;
		}

		$max_idx = 0;
		foreach ($pairs as $pair) {
			$max_idx = max($max_idx, $pair[0], $pair[1]);
		}
		return $max_idx + 1;
	}

	/**
	 * Expand a list of undirected edges into a list of directed edges.
	 *
	 * Each pair [a, b] is emitted as [a, b] and, unless a equals b, also as
	 * [b, a]. The result follows the order of $edges, so it is NOT sorted by
	 * index.
	 *
	 * @param array $edges List of [a, b] pairs of node indices.
	 * @param array $out_edges Output. Any previous content is discarded.
	 */
	public static function convert_unordered_to_ordered(&$edges, &$out_edges): void
	{
		$out_edges = [];
		$num_edges = count($edges);
		for ($i = 0; $i < $num_edges; ++$i) {
			[$from, $to] = $edges[$i];
			$out_edges[] = [$from, $to];
			if ($from !== $to) {
				$out_edges[] = [$to, $from];
			}
		}
	}

	/**
	 * Tell whether the first index of the edges never decreases along the list.
	 */
	private static function is_ordered_by_first_index(array $edges): bool
	{
		$num_edges = count($edges);
		for ($i = 1; $i < $num_edges; ++$i) {
			if ($edges[$i][0] < $edges[$i - 1][0]) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Return a copy of the edges sorted by first index, then by second index.
	 */
	private static function sort_by_index(array $edges): array
	{
		usort(
			$edges,
			static fn (array $a, array $b): int => [$a[0], $a[1]] <=> [$b[0], $b[1]]
		);
		return $edges;
	}
}

lib/Model/ExternalModel/ExternalModel.php

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,6 @@ public function isInstalled(): bool {
8787
}
8888

8989
public function meetDependencies(string &$error_message): bool {
90-
if (!extension_loaded('pdlib')) {
91-
$error_message = "The PDlib PHP extension is not loaded.";
92-
return false;
93-
}
9490
if (is_null($this->settingsService->getExternalModelUrl())) {
9591
$error_message = "You still need to configure the URL of the service running the model.";
9692
return false;

0 commit comments

Comments
 (0)