messense · messense · May 26, 2025 · May 23, 2025
diff --git a/Cargo.toml b/Cargo.toml
@@ -17,6 +17,7 @@ all-features = true
 codspeed-criterion-compat = { workspace = true }
 rand = { workspace = true }
 wasm-bindgen-test = { workspace = true }
+rayon = { workspace = true }
 
 [target.'cfg(unix)'.dev-dependencies]
 jemallocator = "0.5.0"
@@ -59,5 +60,6 @@ ordered-float = "4.0"
 phf = "0.11"
 phf_codegen = "0.11"
 rand = "0.8"
+rayon = "1.10"
 regex = "1.0"
 wasm-bindgen-test = "0.3.0"
diff --git a/benches/jieba_benchmark.rs b/benches/jieba_benchmark.rs
@@ -1,6 +1,7 @@
 use codspeed_criterion_compat::{black_box, criterion_group, criterion_main, Criterion, Throughput};
 use jieba_rs::{Jieba, KeywordExtract, TextRank, TfIdf, TokenizeMode};
 use lazy_static::lazy_static;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
 
 #[cfg(unix)]
 #[global_allocator]
@@ -58,6 +59,25 @@ fn criterion_benchmark(c: &mut Criterion) {
         b.iter(|| TEXTRANK_EXTRACTOR.extract_keywords(&JIEBA, black_box(SENTENCE), 3, Vec::new()))
     });
     group.finish();
+
+    let mut group = c.benchmark_group("multithreaded"); // same workload, serial loop vs. rayon parallel iterator
+    let repeat = 1000usize; // number of `cut` calls per measured iteration
+    group.throughput(Throughput::Bytes(SENTENCE.len() as u64 * repeat as u64));
+    group.bench_function("single_thread", |b| {
+        b.iter(|| {
+            for _ in 0..repeat {
+                black_box(JIEBA.cut(black_box(SENTENCE), true)); // result goes through black_box so the work stays observable
+            }
+        })
+    });
+    group.bench_function("multi_thread", |b| {
+        b.iter(|| {
+            (0..repeat).into_par_iter().for_each(|_| {
+                black_box(JIEBA.cut(black_box(SENTENCE), true)); // result goes through black_box so the work stays observable
+            });
+        })
+    });
+    group.finish();
 }
 
 criterion_group!(benches, criterion_benchmark);

diff --git a/src/hmm.rs b/src/hmm.rs
@@ -1,14 +1,13 @@
 use std::cmp::Ordering;
 
-use lazy_static::lazy_static;
 use regex::Regex;
 
 use crate::SplitMatches;
 use jieba_macros::generate_hmm_data;
 
-lazy_static! {
-    static ref RE_HAN: Regex = Regex::new(r"([\u{4E00}-\u{9FD5}]+)").unwrap();
-    static ref RE_SKIP: Regex = Regex::new(r"([a-zA-Z0-9]+(?:.\d+)?%?)").unwrap();
+thread_local! { // one Regex instance per thread, built on that thread's first access
+    static RE_HAN: Regex = Regex::new(r"([\u{4E00}-\u{9FD5}]+)").unwrap(); // runs of code points U+4E00..=U+9FD5
+    static RE_SKIP: Regex = Regex::new(r"([a-zA-Z0-9]+(?:\.\d+)?%?)").unwrap(); // `\.` is a literal decimal point; a bare `.` matched any char
 }
 
 pub const NUM_STATES: usize = 4;
@@ -190,29 +189,33 @@
 
 #[allow(non_snake_case)]
 pub(crate) fn cut_with_allocated_memory<'a>(sentence: &'a str, words: &mut Vec<&'a str>, hmm_context: &mut HmmContext) {
-    let splitter = SplitMatches::new(&RE_HAN, sentence);
-    for state in splitter {
-        let block = state.into_str();
-        if block.is_empty() {
-            continue;
-        }
-        if RE_HAN.is_match(block) {
-            if block.chars().count() > 1 {
-                cut_internal(block, words, hmm_context);
-            } else {
-                words.push(block);
-            }
-        } else {
-            let skip_splitter = SplitMatches::new(&RE_SKIP, block);
-            for skip_state in skip_splitter {
-                let x = skip_state.into_str();
-                if x.is_empty() {
+    RE_HAN.with(|re_han| { // borrow this thread's RE_HAN for the whole call
+        RE_SKIP.with(|re_skip| { // likewise for RE_SKIP; both stay borrowed until the closures return
+            let splitter = SplitMatches::new(re_han, sentence); // split `sentence` around RE_HAN matches
+            for state in splitter {
+                let block = state.into_str();
+                if block.is_empty() {
                     continue;
                 }
-                words.push(x);
+                if re_han.is_match(block) { // block contains text matched by RE_HAN
+                    if block.chars().count() > 1 { // only multi-char blocks are segmented further
+                        cut_internal(block, words, hmm_context); // appends the block's pieces to `words`
+                    } else {
+                        words.push(block); // a single char is already one token
+                    }
+                } else { // no RE_HAN match in this block: tokenize it with RE_SKIP instead
+                    let skip_splitter = SplitMatches::new(re_skip, block);
+                    for skip_state in skip_splitter {
+                        let x = skip_state.into_str();
+                        if x.is_empty() { // empty fragments are not emitted
+                            continue;
+                        }
+                        words.push(x);
+                    }
+                }
             }
-        }
-    }
+        })
+    })
 }
 
 #[allow(non_snake_case)]

diff --git a/src/lib.rs b/src/lib.rs
@@ -72,7 +72,6 @@
 //!
 
 use include_flate::flate;
-use lazy_static::lazy_static;
 
 use std::cmp::Ordering;
 use std::collections::HashMap;
@@ -102,11 +101,11 @@
 
 use sparse_dag::StaticSparseDAG;
 
-lazy_static! {
-    static ref RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap();
-    static ref RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap();
-    static ref RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap();
-    static ref RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap();
+thread_local! { // per-thread Regex instances; NOTE(review): each new thread compiles these on its first use
+    static RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap(); // listed code-point ranges plus ASCII alphanumerics and + # & . _ % -
+    static RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap(); // one whitespace char, with CRLF taken as a single match
+    static RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap(); // same code-point ranges, without the ASCII extras
+    static RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap(); // any char other than ASCII alphanumerics, '+', '#', newline
 }
 
 struct SplitMatches<'r, 't> {
@@ -647,57 +646,63 @@
 
     #[allow(non_snake_case)]
     fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<&'a str> {
-        let heuristic_capacity = sentence.len() / 2;
-        let mut words = Vec::with_capacity(heuristic_capacity);
-        let re_han: &Regex = if cut_all { &RE_HAN_CUT_ALL } else { &RE_HAN_DEFAULT };
-        let re_skip: &Regex = if cut_all { &RE_SKIP_CUT_ALL } else { &RE_SKIP_DEFAULT };
-        let splitter = SplitMatches::new(re_han, sentence);
-        let mut route = Vec::with_capacity(heuristic_capacity);
-        let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);
-
-        let mut hmm_context = hmm::HmmContext::new(sentence.chars().count());
-
-        for state in splitter {
-            match state {
-                SplitState::Matched(_) => {
-                    let block = state.into_str();
-                    assert!(!block.is_empty());
-
-                    if cut_all {
-                        self.cut_all_internal(block, &mut words);
-                    } else if hmm {
-                        self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context);
-                    } else {
-                        self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
-                    }
-                }
-                SplitState::Unmatched(_) => {
-                    let block = state.into_str();
-                    assert!(!block.is_empty());
-
-                    let skip_splitter = SplitMatches::new(re_skip, block);
-                    for skip_state in skip_splitter {
-                        let word = skip_state.into_str();
-                        if word.is_empty() {
-                            continue;
+        let re_han = if cut_all { &RE_HAN_CUT_ALL } else { &RE_HAN_DEFAULT }; // still a thread-local key here, not a Regex
+        let re_skip = if cut_all { &RE_SKIP_CUT_ALL } else { &RE_SKIP_DEFAULT }; // key for the matching skip pattern
+
+        re_han.with(|re_han| { // shadow the key with this thread's &Regex
+            re_skip.with(|re_skip| { // same for the skip pattern; all work happens inside both borrows
+                let heuristic_capacity = sentence.len() / 2; // capacity guess shared by words, route and dag
+                let mut words = Vec::with_capacity(heuristic_capacity);
+
+                let splitter = SplitMatches::new(re_han, sentence);
+                let mut route = Vec::with_capacity(heuristic_capacity);
+                let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);
+
+                let mut hmm_context = hmm::HmmContext::new(sentence.chars().count()); // sized by char count, reused for every block
+
+                for state in splitter {
+                    match state {
+                        SplitState::Matched(_) => { // text accepted by re_han: segment it
+                            let block = state.into_str();
+                            assert!(!block.is_empty());
+
+                            if cut_all {
+                                self.cut_all_internal(block, &mut words);
+                            } else if hmm {
+                                self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context);
+                            } else {
+                                self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
+                            }
                         }
-                        if cut_all || re_skip.is_match(word) {
-                            words.push(word);
-                        } else {
-                            let mut word_indices = word.char_indices().map(|x| x.0).peekable();
-                            while let Some(byte_start) = word_indices.next() {
-                                if let Some(byte_end) = word_indices.peek() {
-                                    words.push(&word[byte_start..*byte_end]);
+                        SplitState::Unmatched(_) => { // text between re_han matches
+                            let block = state.into_str();
+                            assert!(!block.is_empty());
+
+                            let skip_splitter = SplitMatches::new(re_skip, block);
+                            for skip_state in skip_splitter {
+                                let word = skip_state.into_str();
+                                if word.is_empty() { // empty fragments are not emitted
+                                    continue;
+                                }
+                                if cut_all || re_skip.is_match(word) { // emitted whole: any fragment in cut_all mode, or one matching re_skip
+                                    words.push(word);
                                 } else {
-                                    words.push(&word[byte_start..]);
+                                    let mut word_indices = word.char_indices().map(|x| x.0).peekable(); // byte offset of each char
+                                    while let Some(byte_start) = word_indices.next() {
+                                        if let Some(byte_end) = word_indices.peek() { // next char's offset ends this one
+                                            words.push(&word[byte_start..*byte_end]);
+                                        } else {
+                                            words.push(&word[byte_start..]); // last char runs to the end of `word`
+                                        }
+                                    }
                                 }
                             }
                         }
                     }
                 }
-            }
-        }
-        words
+                words // returned out through both closures as the function result
+            })
+        })
     }
 
     /// Cut the input text
@@ -898,32 +903,34 @@
 
     #[test]
     fn test_split_matches() {
-        let re_han = &*RE_HAN_DEFAULT;
-        let splitter = SplitMatches::new(
-            re_han,
-            "👪 PS: 我觉得开源有一个好处，就是能够敦促自己不断改进 👪，避免敞帚自珍",
-        );
-        for state in splitter {
-            match state {
-                SplitState::Matched(_) => {
-                    let block = state.into_str();
-                    assert!(!block.is_empty());
-                }
-                SplitState::Unmatched(_) => {
-                    let block = state.into_str();
-                    assert!(!block.is_empty());
+        RE_HAN_DEFAULT.with(|re_han| { // the regex is thread-local now, so the test body runs inside the borrow
+            let splitter = SplitMatches::new(
+                re_han,
+                "👪 PS: 我觉得开源有一个好处，就是能够敦促自己不断改进 👪，避免敞帚自珍",
+            );
+            for state in splitter {
+                match state {
+                    SplitState::Matched(_) => {
+                        let block = state.into_str();
+                        assert!(!block.is_empty()); // matched spans are never empty
+                    }
+                    SplitState::Unmatched(_) => {
+                        let block = state.into_str();
+                        assert!(!block.is_empty()); // neither are the gaps between them
+                    }
                 }
             }
-        }
+        });
     }
 
     #[test]
     fn test_split_matches_against_unicode_sip() {
-        let re_han = &*RE_HAN_DEFAULT;
-        let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");
+        RE_HAN_DEFAULT.with(|re_han| { // the regex is thread-local now, so the test body runs inside the borrow
+            let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");
 
-        let result: Vec<&str> = splitter.map(|x| x.into_str()).collect();
-        assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
+            let result: Vec<&str> = splitter.map(|x| x.into_str()).collect();
+            assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]); // the whole input must come back as one span
+        });
     }
 
     #[test]