Skip to content

Optimize performance for large-scale parallelism #122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ all-features = true
codspeed-criterion-compat = { workspace = true }
rand = { workspace = true }
wasm-bindgen-test = { workspace = true }
rayon = { workspace = true }

[target.'cfg(unix)'.dev-dependencies]
jemallocator = "0.5.0"
Expand Down Expand Up @@ -59,5 +60,6 @@ ordered-float = "4.0"
phf = "0.11"
phf_codegen = "0.11"
rand = "0.8"
rayon = "1.10"
regex = "1.0"
wasm-bindgen-test = "0.3.0"
20 changes: 20 additions & 0 deletions benches/jieba_benchmark.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use codspeed_criterion_compat::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use jieba_rs::{Jieba, KeywordExtract, TextRank, TfIdf, TokenizeMode};
use lazy_static::lazy_static;
use rayon::iter::{IntoParallelIterator, ParallelIterator};

#[cfg(unix)]
#[global_allocator]
Expand Down Expand Up @@ -58,6 +59,25 @@ fn criterion_benchmark(c: &mut Criterion) {
b.iter(|| TEXTRANK_EXTRACTOR.extract_keywords(&JIEBA, black_box(SENTENCE), 3, Vec::new()))
});
group.finish();

let mut group = c.benchmark_group("multithreaded");
let repeat = 1000usize;
group.throughput(Throughput::Bytes(SENTENCE.len() as u64 * repeat as u64));
group.bench_function("single_thread", |b| {
b.iter(|| {
for _ in 0..repeat {
let _words = JIEBA.cut(black_box(&SENTENCE), true);
}
})
});
group.bench_function("multi_thread", |b| {
b.iter(|| {
(0..repeat).into_par_iter().for_each(|_| {
let _words = JIEBA.cut(black_box(&SENTENCE), true);
});
})
});
group.finish();
}

criterion_group!(benches, criterion_benchmark);
Expand Down
51 changes: 27 additions & 24 deletions src/hmm.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
use std::cmp::Ordering;

use lazy_static::lazy_static;
use regex::Regex;

use crate::SplitMatches;
use jieba_macros::generate_hmm_data;

lazy_static! {
static ref RE_HAN: Regex = Regex::new(r"([\u{4E00}-\u{9FD5}]+)").unwrap();
static ref RE_SKIP: Regex = Regex::new(r"([a-zA-Z0-9]+(?:.\d+)?%?)").unwrap();
// Per-thread regexes used by the HMM segmenter. `thread_local!` gives every
// thread its own lazily-initialised `Regex` (compiled on that thread's first
// access), reached through `RE_HAN.with(|re| ...)` / `RE_SKIP.with(|re| ...)`.
// NOTE(review): presumably this avoids contention on a single shared `Regex`
// when many threads segment text at once — confirm against the regex crate's
// guidance on sharing a regex across threads.
thread_local! {
// One or more consecutive characters in the range U+4E00..=U+9FD5.
static RE_HAN: Regex = Regex::new(r"([\u{4E00}-\u{9FD5}]+)").unwrap();
// A run of ASCII letters/digits, optionally followed by one character plus
// digits, optionally followed by `%`.
// NOTE(review): the `.` before `\d+` is unescaped, so it matches any
// character rather than only a literal decimal point — confirm whether `\.`
// was intended.
static RE_SKIP: Regex = Regex::new(r"([a-zA-Z0-9]+(?:.\d+)?%?)").unwrap();
}

pub const NUM_STATES: usize = 4;
Expand Down Expand Up @@ -190,29 +189,33 @@

#[allow(non_snake_case)]
pub(crate) fn cut_with_allocated_memory<'a>(sentence: &'a str, words: &mut Vec<&'a str>, hmm_context: &mut HmmContext) {
let splitter = SplitMatches::new(&RE_HAN, sentence);
for state in splitter {
let block = state.into_str();
if block.is_empty() {
continue;
}
if RE_HAN.is_match(block) {
if block.chars().count() > 1 {
cut_internal(block, words, hmm_context);
} else {
words.push(block);
}
} else {
let skip_splitter = SplitMatches::new(&RE_SKIP, block);
for skip_state in skip_splitter {
let x = skip_state.into_str();
if x.is_empty() {
RE_HAN.with(|re_han| {
RE_SKIP.with(|re_skip| {
let splitter = SplitMatches::new(re_han, sentence);
for state in splitter {
let block = state.into_str();
if block.is_empty() {
continue;
}
words.push(x);
if re_han.is_match(block) {
if block.chars().count() > 1 {
cut_internal(block, words, hmm_context);
} else {
words.push(block);
}
} else {
let skip_splitter = SplitMatches::new(re_skip, block);
for skip_state in skip_splitter {
let x = skip_state.into_str();
if x.is_empty() {
continue;

Check warning on line 211 in src/hmm.rs

View check run for this annotation

Codecov / codecov/patch

src/hmm.rs#L211

Added line #L211 was not covered by tests
}
words.push(x);
}
}
}
}
}
})
})
}

#[allow(non_snake_case)]
Expand Down
145 changes: 76 additions & 69 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@
//!

use include_flate::flate;
use lazy_static::lazy_static;

use std::cmp::Ordering;
use std::collections::HashMap;
Expand Down Expand Up @@ -102,11 +101,11 @@

use sparse_dag::StaticSparseDAG;

lazy_static! {
static ref RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap();
static ref RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap();
static ref RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap();
static ref RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap();
// Per-thread regexes used to split input before segmentation. `thread_local!`
// gives every thread its own lazily-initialised `Regex`; callers borrow one via
// `KEY.with(|re| ...)`. The `*_DEFAULT` pair is selected when `cut_all` is
// false and the `*_CUT_ALL` pair when it is true.
// NOTE(review): presumably per-thread copies are used to avoid contention on a
// single shared `Regex` under heavy parallelism — confirm against the regex
// crate's guidance on sharing a regex across threads.
thread_local! {
// Runs of characters from the listed CJK code-point ranges, ASCII
// letters/digits, or any of `+ # & . _ % -`.
static RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap();
// A CRLF pair, or any single whitespace character.
static RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap();
// Runs of characters from the same CJK code-point ranges only (no ASCII).
static RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap();
// Any single character that is not an ASCII letter/digit, `+`, `#`, or `\n`.
static RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap();
}

struct SplitMatches<'r, 't> {
Expand Down Expand Up @@ -647,57 +646,63 @@

#[allow(non_snake_case)]
fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<&'a str> {
let heuristic_capacity = sentence.len() / 2;
let mut words = Vec::with_capacity(heuristic_capacity);
let re_han: &Regex = if cut_all { &RE_HAN_CUT_ALL } else { &RE_HAN_DEFAULT };
let re_skip: &Regex = if cut_all { &RE_SKIP_CUT_ALL } else { &RE_SKIP_DEFAULT };
let splitter = SplitMatches::new(re_han, sentence);
let mut route = Vec::with_capacity(heuristic_capacity);
let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);

let mut hmm_context = hmm::HmmContext::new(sentence.chars().count());

for state in splitter {
match state {
SplitState::Matched(_) => {
let block = state.into_str();
assert!(!block.is_empty());

if cut_all {
self.cut_all_internal(block, &mut words);
} else if hmm {
self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context);
} else {
self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
}
}
SplitState::Unmatched(_) => {
let block = state.into_str();
assert!(!block.is_empty());

let skip_splitter = SplitMatches::new(re_skip, block);
for skip_state in skip_splitter {
let word = skip_state.into_str();
if word.is_empty() {
continue;
let re_han = if cut_all { &RE_HAN_CUT_ALL } else { &RE_HAN_DEFAULT };
let re_skip = if cut_all { &RE_SKIP_CUT_ALL } else { &RE_SKIP_DEFAULT };

re_han.with(|re_han| {
re_skip.with(|re_skip| {
let heuristic_capacity = sentence.len() / 2;
let mut words = Vec::with_capacity(heuristic_capacity);

let splitter = SplitMatches::new(re_han, sentence);
let mut route = Vec::with_capacity(heuristic_capacity);
let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);

let mut hmm_context = hmm::HmmContext::new(sentence.chars().count());

for state in splitter {
match state {
SplitState::Matched(_) => {
let block = state.into_str();
assert!(!block.is_empty());

if cut_all {
self.cut_all_internal(block, &mut words);
} else if hmm {
self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context);
} else {
self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
}
}
if cut_all || re_skip.is_match(word) {
words.push(word);
} else {
let mut word_indices = word.char_indices().map(|x| x.0).peekable();
while let Some(byte_start) = word_indices.next() {
if let Some(byte_end) = word_indices.peek() {
words.push(&word[byte_start..*byte_end]);
SplitState::Unmatched(_) => {
let block = state.into_str();
assert!(!block.is_empty());

let skip_splitter = SplitMatches::new(re_skip, block);
for skip_state in skip_splitter {
let word = skip_state.into_str();
if word.is_empty() {
continue;

Check warning on line 685 in src/lib.rs

View check run for this annotation

Codecov / codecov/patch

src/lib.rs#L685

Added line #L685 was not covered by tests
}
if cut_all || re_skip.is_match(word) {
words.push(word);
} else {
words.push(&word[byte_start..]);
let mut word_indices = word.char_indices().map(|x| x.0).peekable();
while let Some(byte_start) = word_indices.next() {
if let Some(byte_end) = word_indices.peek() {
words.push(&word[byte_start..*byte_end]);
} else {
words.push(&word[byte_start..]);
}
}
}
}
}
}
}
}
}
words
words
})
})
}

/// Cut the input text
Expand Down Expand Up @@ -898,32 +903,34 @@

#[test]
fn test_split_matches() {
let re_han = &*RE_HAN_DEFAULT;
let splitter = SplitMatches::new(
re_han,
"👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敞帚自珍",
);
for state in splitter {
match state {
SplitState::Matched(_) => {
let block = state.into_str();
assert!(!block.is_empty());
}
SplitState::Unmatched(_) => {
let block = state.into_str();
assert!(!block.is_empty());
RE_HAN_DEFAULT.with(|re_han| {
let splitter = SplitMatches::new(
re_han,
"👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敞帚自珍",
);
for state in splitter {
match state {
SplitState::Matched(_) => {
let block = state.into_str();
assert!(!block.is_empty());
}
SplitState::Unmatched(_) => {
let block = state.into_str();
assert!(!block.is_empty());
}
}
}
}
});
}

#[test]
fn test_split_matches_against_unicode_sip() {
let re_han = &*RE_HAN_DEFAULT;
let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");
RE_HAN_DEFAULT.with(|re_han| {
let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");

let result: Vec<&str> = splitter.map(|x| x.into_str()).collect();
assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
let result: Vec<&str> = splitter.map(|x| x.into_str()).collect();
assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
});
}

#[test]
Expand Down
Loading