feat: array_to_sentence_string and number_of_words filters from Jekyll, #443

harttle · harttle · commit 50253a98caf5 · 2024-05-13T00:38:52.000+08:00
diff --git a/docs/source/_data/sidebar.yml b/docs/source/_data/sidebar.yml
@@ -29,6 +29,7 @@ filters:
   overview: overview.html
   abs: abs.html
   append: append.html
+  array_to_sentence_string: array_to_sentence_string.html
   at_least: at_least.html
   at_most: at_most.html
   capitalize: capitalize.html
@@ -63,6 +64,7 @@ filters:
   modulo: modulo.html
   newline_to_br: newline_to_br.html
   normalize_whitespace: normalize_whitespace.html
+  number_of_words: number_of_words.html
   plus: plus.html
   pop: pop.html
   push: push.html
diff --git a/docs/source/filters/array_to_sentence_string.md b/docs/source/filters/array_to_sentence_string.md
@@ -0,0 +1,27 @@
+---
+title: array_to_sentence_string
+---
+
+{% since %}v10.13.0{% endsince %}
+
+Convert an array into a sentence. Useful for listing tags. Takes an optional connector argument, which defaults to `and`.
+
+Input
+```liquid
+{{ "foo,bar,baz" | split: "," | array_to_sentence_string }}
+```
+
+Output
+```text
+foo, bar, and baz
+```
+
+Input
+```liquid
+{{ "foo,bar,baz" | split: "," | array_to_sentence_string: "or" }}
+```
+
+Output
+```text
+foo, bar, or baz
+```
diff --git a/docs/source/filters/number_of_words.md b/docs/source/filters/number_of_words.md
@@ -0,0 +1,49 @@
+---
+title: number_of_words
+---
+
+{% since %}v10.13.0{% endsince %}
+
+Count the number of words in some text. This filter takes an optional argument to control the handling of Chinese-Japanese-Korean (CJK) characters in the input string:
+- Passing `'cjk'` as the argument will count every CJK character detected as one word irrespective of being separated by whitespace.
+- Passing `'auto'` (auto-detect) works similarly to `'cjk'` but is more performant when the filter is applied to a variable string that may or may not contain CJK characters.
+
+Input
+```liquid
+{{ "Hello world!" | number_of_words }}
+```
+
+Output
+```text
+2
+```
+
+Input
+```liquid
+{{ "你好hello世界world" | number_of_words }}
+```
+
+Output
+```text
+1
+```
+
+Input
+```liquid
+{{ "你好hello世界world" | number_of_words: "cjk" }}
+```
+
+Output
+```text
+6
+```
+
+Input
+```liquid
+{{ "你好hello世界world" | number_of_words: "auto" }}
+```
+
+Output
+```text
+6
+```
diff --git a/docs/source/filters/overview.md b/docs/source/filters/overview.md
@@ -10,7 +10,7 @@ There's 40+ filters supported by LiquidJS. These filters can be categorized into
 Categories | Filters
 --- | ---
 Math | plus, minus, modulo, times, floor, ceil, round, divided_by, abs, at_least, at_most
-String | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last,remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace
+String | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last, remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace, number_of_words, array_to_sentence_string
 HTML/URI | escape, escape_once, url_encode, url_decode, strip_html, newline_to_br, xml_escape, cgi_escape, uri_escape
 Array | slice, map, sort, sort_natural, uniq, where, where_exp, group_by, group_by_exp, find, find_exp, first, last, join, reverse, concat, compact, size, push, pop, shift, unshift
 Date | date, date_to_xmlschema, date_to_rfc822, date_to_string, date_to_long_string
diff --git a/docs/source/zh-cn/filters/array_to_sentence_string.md b/docs/source/zh-cn/filters/array_to_sentence_string.md
@@ -0,0 +1,27 @@
+---
+title: array_to_sentence_string
+---
+
+{% since %}v10.13.0{% endsince %}
+
+把数组转化为句子，用于做标签列表。有一个可选的连接词参数，默认为 `and`。
+
+输入
+```liquid
+{{ "foo,bar,baz" | split: "," | array_to_sentence_string }}
+```
+
+输出
+```text
+foo, bar, and baz
+```
+
+输入
+```liquid
+{{ "foo,bar,baz" | split: "," | array_to_sentence_string: "or" }}
+```
+
+输出
+```text
+foo, bar, or baz
+```
diff --git a/docs/source/zh-cn/filters/number_of_words.md b/docs/source/zh-cn/filters/number_of_words.md
@@ -0,0 +1,49 @@
+---
+title: number_of_words
+---
+
+{% since %}v10.13.0{% endsince %}
+
+计算文本中的单词数。此过滤器接受一个可选参数，用于控制输入字符串中汉字-日语-韩语（CJK）字符的处理方式：
+- `'cjk'`：将每个检测到的 CJK 字符计为一个单词，无论是否由空格分隔。
+- `'auto'`：与 `'cjk'` 类似，但如果过滤器用于可能包含或不包含 CJK 字符的字符串，则性能更好。
+
+输入
+```liquid
+{{ "Hello world!" | number_of_words }}
+```
+
+输出
+```text
+2
+```
+
+输入
+```liquid
+{{ "你好hello世界world" | number_of_words }}
+```
+
+输出
+```text
+1
+```
+
+输入
+```liquid
+{{ "你好hello世界world" | number_of_words: "cjk" }}
+```
+
+输出
+```text
+6
+```
+
+输入
+```liquid
+{{ "你好hello世界world" | number_of_words: "auto" }}
+```
+
+输出
+```text
+6
+```
diff --git a/docs/source/zh-cn/filters/overview.md b/docs/source/zh-cn/filters/overview.md
@@ -10,7 +10,7 @@ LiquidJS 共支持 40+ 个过滤器，可以分为如下几类：
 类别 | 过滤器
 --- | ---
 数学 | plus, minus, modulo, times, floor, ceil, round, divided_by, abs, at_least, at_most
-字符串 | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last, remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace
+字符串 | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last, remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace, number_of_words, array_to_sentence_string
 HTML/URI | escape, escape_once, url_encode, url_decode, strip_html, newline_to_br, xml_escape, cgi_escape, uri_escape
 数组 | slice, map, sort, sort_natural, uniq, where, where_exp, group_by, group_by_exp, find, find_exp, first, last, join, reverse, concat, compact, size, push, pop, shift, unshift
 日期 | date, date_to_xmlschema, date_to_rfc822, date_to_string, date_to_long_string
diff --git a/src/filters/string.ts b/src/filters/string.ts
@@ -3,8 +3,20 @@
  *
  * * prefer stringify() to String() since `undefined`, `null` should eval ''
  */
+
+// Han (Chinese) characters: \u4E00-\u9FFF
+// Additional Han characters: \uF900-\uFAFF (CJK Compatibility Ideographs)
+// Additional Han characters: \u3400-\u4DBF (CJK Unified Ideographs Extension A)
+// Katakana (Japanese): \u30A0-\u30FF
+// Hiragana (Japanese): \u3040-\u309F
+// Hangul (Korean): \uAC00-\uD7AF
 import { assert, escapeRegExp, stringify } from '../util'
 
+const rCJKWord = /[\u4E00-\u9FFF\uF900-\uFAFF\u3400-\u4DBF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF]/gu
+
+// A run of characters that are neither CJK nor whitespace (each run is counted as one word)
+const rNonCJKWord = /[^\u4E00-\u9FFF\uF900-\uFAFF\u3400-\u4DBF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF\s]+/gu
+
 export function append (v: string, arg: string) {
   assert(arguments.length === 2, 'append expect 2 arguments')
   return stringify(v) + stringify(arg)
@@ -32,16 +44,16 @@ export function upcase (str: string) {
 }
 
 export function remove (v: string, arg: string) {
-  return stringify(v).split(String(arg)).join('')
+  return stringify(v).split(stringify(arg)).join('')
 }
 
 export function remove_first (v: string, l: string) {
-  return stringify(v).replace(String(l), '')
+  return stringify(v).replace(stringify(l), '')
 }
 
 export function remove_last (v: string, l: string) {
   const str = stringify(v)
-  const pattern = String(l)
+  const pattern = stringify(l)
   const index = str.lastIndexOf(pattern)
   if (index === -1) return str
   return str.substring(0, index) + str.substring(index + pattern.length)
@@ -56,7 +68,7 @@ export function rstrip (str: string, chars?: string) {
 }
 
 export function split (v: string, arg: string) {
-  const arr = stringify(v).split(String(arg))
+  const arr = stringify(v).split(stringify(arg))
   // align to ruby split, which is the behavior of shopify/liquid
   // see: https://ruby-doc.org/core-2.4.0/String.html#method-i-split
   while (arr.length && arr[arr.length - 1] === '') arr.pop()
@@ -83,19 +95,19 @@ export function capitalize (str: string) {
 }
 
 export function replace (v: string, pattern: string, replacement: string) {
-  return stringify(v).split(String(pattern)).join(replacement)
+  return stringify(v).split(stringify(pattern)).join(replacement)
 }
 
 export function replace_first (v: string, arg1: string, arg2: string) {
-  return stringify(v).replace(String(arg1), arg2)
+  return stringify(v).replace(stringify(arg1), arg2)
 }
 
 export function replace_last (v: string, arg1: string, arg2: string) {
   const str = stringify(v)
-  const pattern = String(arg1)
+  const pattern = stringify(arg1)
   const index = str.lastIndexOf(pattern)
   if (index === -1) return str
-  const replacement = String(arg2)
+  const replacement = stringify(arg2)
   return str.substring(0, index) + replacement + str.substring(index + pattern.length)
 }
 
@@ -117,3 +129,69 @@ export function normalize_whitespace (v: string) {
  v = stringify(v)
  return v.replace(/\s+/g, ' ')
 }
+
+/**
+ * Count the words in `input`.
+ *
+ * The input is converted with stringify() and trimmed; empty or
+ * whitespace-only input counts as 0 words.
+ *
+ * @param input the text to count words in
+ * @param mode how CJK characters are treated:
+ *   - omitted (or any other value): count whitespace-separated tokens only
+ *   - 'cjk': each CJK character is one word, and each run of non-CJK,
+ *     non-whitespace characters is one more word
+ *   - 'auto': like 'cjk' when the input contains a CJK character, otherwise
+ *     count whitespace-separated tokens
+ * @returns the number of words
+ */
+export function number_of_words (input: string, mode?: 'cjk' | 'auto') {
+  input = stringify(input).trim()
+  if (!input) return 0
+  switch (mode) {
+    case 'cjk':
+      // Count CJK characters and non-CJK words
+      return (input.match(rCJKWord) || []).length + (input.match(rNonCJKWord) || []).length
+    case 'auto': {
+      // Match once and branch on the result. rCJKWord carries the `g` flag, so
+      // RegExp#test would advance its shared lastIndex, and the CJK scan would
+      // then have to run a second time in match().
+      const cjkChars = input.match(rCJKWord)
+      return cjkChars
+        ? cjkChars.length + (input.match(rNonCJKWord) || []).length
+        : input.split(/\s+/).length
+    }
+    default:
+      // Count whitespace-separated tokens only
+      return input.split(/\s+/).length
+  }
+}
+
+/**
+ * Join the items of `array` into a sentence-style list, e.g.
+ * "foo, bar, and baz".
+ *
+ * Every item is converted with stringify(), so null and undefined items
+ * render as '' in every position.
+ *
+ * @param array the items to join
+ * @param connector the word placed before the last item, 'and' by default
+ * @returns '' for an empty array, the item as a string for a single item,
+ *   "a <connector> b" for two items, and "a, b, <connector> c" for more
+ */
+export function array_to_sentence_string (array: unknown[], connector = 'and') {
+  // Convert every item up front: template interpolation would print
+  // "null"/"undefined" for nil items while join() prints '', and this file
+  // prefers stringify() so that nil evaluates to ''.
+  const items = array.map(item => stringify(item))
+  switch (items.length) {
+    case 0:
+      return ''
+    case 1:
+      return items[0]
+    case 2:
+      return `${items[0]} ${connector} ${items[1]}`
+    default:
+      return `${items.slice(0, -1).join(', ')}, ${connector} ${items[items.length - 1]}`
+  }
+}
diff --git a/test/integration/filters/string.spec.ts b/test/integration/filters/string.spec.ts
@@ -238,4 +238,100 @@ describe('filters/string', function () {
       expect(liquid.parseAndRenderSync('{{ "a \n b  c" | normalize_whitespace }}')).toEqual('a b c')
     })
   })
+  describe('number_of_words', () => {
+    it('should count words of Latin sentence with mode "auto"', async () => {
+      const html = await liquid.parseAndRender('{{ "I\'m not hungry" | number_of_words: "auto"}}')
+      expect(html).toEqual('3')
+    })
+
+    it('should count words of Latin sentence', async () => {
+      const html = await liquid.parseAndRender('{{ "Hello world!" | number_of_words }}')
+      expect(html).toEqual('2')
+    })
+
+    it('should count mixed CJK sentence without whitespace as one word by default', async () => {
+      const html = await liquid.parseAndRender('{{ "你好hello世界world" | number_of_words }}')
+      expect(html).toEqual('1')
+    })
+
+    it('should count words of CJK sentence with mode "cjk"', async () => {
+      const html = await liquid.parseAndRender('{{ "你好hello世界world" | number_of_words: "cjk" }}')
+      expect(html).toEqual('6')
+    })
+
+    it('should count words of CJK sentence with mode "auto"', async () => {
+      const html = await liquid.parseAndRender('{{ "你好hello世界world" | number_of_words: "auto" }}')
+      expect(html).toEqual('6')
+    })
+    it('should handle empty input', async () => {
+      const html = await liquid.parseAndRender('{{ "" | number_of_words }}')
+      expect(html).toEqual('0')
+    })
+
+    it('should handle input with only whitespace', async () => {
+      const html = await liquid.parseAndRender('{{ "   " | number_of_words }}')
+      expect(html).toEqual('0')
+    })
+
+    it('should count words with punctuation marks', async () => {
+      const html = await liquid.parseAndRender('{{ "Hello! This is a test." | number_of_words }}')
+      expect(html).toEqual('5')
+    })
+
+    it('should count words with special characters', async () => {
+      const html = await liquid.parseAndRender('{{ "This is a test with special characters: !@#$%^&*()-_+=`~[]{};:\'\\"\\|<,>.?/" | number_of_words }}')
+      expect(html).toEqual('8')
+    })
+
+    it('should count words with multiple spaces between words', async () => {
+      const html = await liquid.parseAndRender('{{ "   Hello    world!    " | number_of_words }}')
+      expect(html).toEqual('2')
+    })
+
+    it('should count words with mixed CJK characters', async () => {
+      const html = await liquid.parseAndRender('{{ "你好こんにちは안녕하세요" | number_of_words: "cjk" }}')
+      expect(html).toEqual('12')
+    })
+  })
+  describe('array_to_sentence_string', () => {
+    it('should handle an empty array', async () => {
+      const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: [] })
+      expect(html).toEqual('')
+    })
+
+    it('should handle an array with one element', async () => {
+      const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ['apple'] })
+      expect(html).toEqual('apple')
+    })
+
+    it('should handle an array with two elements', async () => {
+      const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ['apple', 'banana'] })
+      expect(html).toEqual('apple and banana')
+    })
+
+    it('should handle an array with more than two elements', async () => {
+      const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ['apple', 'banana', 'orange'] })
+      expect(html).toEqual('apple, banana, and orange')
+    })
+
+    it('should handle an array with custom connector', async () => {
+      const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string: "or" }}', { arr: ['apple', 'banana', 'orange'] })
+      expect(html).toEqual('apple, banana, or orange')
+    })
+
+    it('should handle an array of numbers', async () => {
+      const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: [1, 2, 3] })
+      expect(html).toEqual('1, 2, and 3')
+    })
+
+    it('should handle an array of mixed types', async () => {
+      const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ['apple', 2, 'orange'] })
+      expect(html).toEqual('apple, 2, and orange')
+    })
+
+    it('should work with the output of the split filter', async () => {
+      const html = await liquid.parseAndRender('{{ "foo,bar,baz" | split: "," | array_to_sentence_string }}')
+      expect(html).toEqual('foo, bar, and baz')
+    })
+  })
 })