Skip to content

Support Regex function #2059

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion example/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
table_obj.insert(
[{"c1": 'a', "c2": 'a'}, {"c1": 'b', "c2": 'b'}, {"c1": 'c', "c2": 'c'}, {"c1": 'd', "c2": 'd'},
{"c1": 'abc', "c2": 'abc'}, {"c1": 'bbc', "c2": 'bbc'}, {"c1": 'cbc', "c2": 'cbc'}, {"c1": 'dbc', "c2": 'dbc'},
{"c1": 'abcd', "c2": 'abc'}])
{"c1": 'abcd', "c2": 'abc'},
{"c1": '[email protected]', "c2": 'email'}, {"c1": '[email protected]', "c2": 'email'}])

#function char_length
res = table_obj.output(["*"]).filter("char_length(c1) = 1").to_df()
Expand All @@ -34,6 +35,13 @@
res = table_obj.output(["*"]).filter("char_length(c1) = char_length(c2)").to_df()
print(res)

# function regex
# Plain substring-style pattern: keeps the rows whose c1 contains 'bc'.
res = table_obj.output(["*"]).filter("regex(c1, 'bc')").to_df()
print(res)

# Mail-address pattern. A raw string is used so that the backslashes in \w and \.
# reach the filter expression unchanged without relying on Python's handling of
# unrecognised escape sequences in ordinary string literals.
res = table_obj.output(["*"]).filter(r"regex(c1, '(\w+([-+.]\w+)*)@(\w+([-.]\w+)*)\.(\w+([-.]\w+)*)')").to_df()
print(res)

res = db_obj.drop_table("function_example")

infinity_obj.disconnect()
69 changes: 69 additions & 0 deletions example/http/functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,30 @@ curl --request POST \
"sparse_column": {"20":7.7, "80":7.8, "90": 97.9},
"year": 2018,
"tensor": [[5.0, 4.2, 4.3, 4.5], [4.0, 4.2, 4.3, 4.4]]
},
{
"num": 5,
"body": "[email protected]",
"vec": [4.0, 4.2, 4.3, 4.5],
"sparse_column": {"20":7.7, "80":7.8, "90": 97.9},
"year": 2018,
"tensor": [[5.0, 4.2, 4.3, 4.5], [4.0, 4.2, 4.3, 4.4]]
},
{
"num": 6,
"body": "test@hotmailcom",
"vec": [4.0, 4.2, 4.3, 4.5],
"sparse_column": {"20":7.7, "80":7.8, "90": 97.9},
"year": 2018,
"tensor": [[5.0, 4.2, 4.3, 4.5], [4.0, 4.2, 4.3, 4.4]]
},
{
"num": 7,
"body": "this is a sentence including a mail address, [email protected]",
"vec": [4.0, 4.2, 4.3, 4.5],
"sparse_column": {"20":7.7, "80":7.8, "90": 97.9},
"year": 2018,
"tensor": [[5.0, 4.2, 4.3, 4.5], [4.0, 4.2, 4.3, 4.4]]
}
] '

Expand Down Expand Up @@ -134,6 +158,51 @@ curl --request GET \
"filter": "char_length(body) > 4"
} '

# show rows of 'tbl1' where body including 'com'
# The filter sent to the server is: regex(body, 'com')
echo -e '\n\n-- show rows of 'tbl1' where body including '\'com\'''
curl --request GET \
  --url http://localhost:23820/databases/default_db/tables/tbl1/docs \
  --header 'accept: application/json' \
  --header 'content-type: application/json' \
  --data '
{
"output":
[
"body"
],
"filter": "regex(body, '\'com\'')"
} '

# show rows of 'tbl1' where body including a mail address (using regex)
# The filter sent to the server is: regex(body, '(.*)@(.*)\\.com')
# Each '\'' below closes the shell single quote, emits a literal ', and reopens it.
echo -e "\n\n-- show rows of tbl1 where body including a mail address (using regex)"
curl -X GET \
  --url http://localhost:23820/databases/default_db/tables/tbl1/docs \
  -H 'accept: application/json' \
  -H 'content-type: application/json' \
  -d '
{
"output":
[
"body"
],
"filter": "regex(body, '\''(.*)@(.*)\\.com'\'')"
} '

# show rows of 'tbl1' where body including a mail address
# (using a stricter regex built from explicit character classes)
# The filter sent to the server is:
#   regex(body, '([0-9A-Za-z_]+([-+.][0-9A-Za-z_]+)*)@([0-9A-Za-z_]+([-.][0-9A-Za-z_]+)*)\\.([0-9A-Za-z_]+([-.][0-9A-Za-z_]+)*)')
# The whole pattern stays inside shell single quotes so that the [...] bracket
# expressions are never exposed to pathname expansion; each '\'' closes the
# single quote, emits a literal ', and reopens it.
echo -e '\n\n-- show rows of 'tbl1' where body including a mail address (using regex)'
curl --request GET \
  --url http://localhost:23820/databases/default_db/tables/tbl1/docs \
  --header 'accept: application/json' \
  --header 'content-type: application/json' \
  --data '
{
"output":
[
"body"
],
"filter": "regex(body, '\''([0-9A-Za-z_]+([-+.][0-9A-Za-z_]+)*)@([0-9A-Za-z_]+([-.][0-9A-Za-z_]+)*)\\.([0-9A-Za-z_]+([-.][0-9A-Za-z_]+)*)'\'')"
} '

# drop tbl1
echo -e '\n\n-- drop tbl1'
curl --request DELETE \
Expand Down
21 changes: 21 additions & 0 deletions python/test_pysdk/test_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,4 +831,25 @@ def test_select_varchar_length(self, suffix):
.astype({'c1': dtype('O'), 'c2': dtype('O')}))

res = db_obj.drop_table("test_select_varchar_length"+suffix)
assert res.error_code == ErrorCode.OK

def test_select_regex(self, suffix):
    """Filter a varchar table with regex(c1, 'bc') and check the returned rows.

    Creates a two-column varchar table, inserts eight rows whose c1 and c2 hold
    the same text, asserts that the regex filter returns exactly the four rows
    containing 'bc', then drops the table.
    """
    table_name = "test_select_regex"+suffix
    db_obj = self.infinity_obj.get_database("default_db")

    # Start from a clean slate: a leftover table from an earlier run is ignored.
    db_obj.drop_table(table_name, ConflictType.Ignore)
    column_defs = {
        "c1": {"type": "varchar", "constraints": ["primary key", "not null"]},
        "c2": {"type": "varchar", "constraints": ["not null"]},
    }
    db_obj.create_table(table_name, column_defs, ConflictType.Error)
    table_obj = db_obj.get_table(table_name)

    # Every row stores the same text in both columns.
    texts = ['a', 'b', 'c', 'd', 'abc', 'bbc', 'cbc', 'dbc']
    table_obj.insert([{"c1": text, "c2": text} for text in texts])

    res = table_obj.output(["*"]).filter("regex(c1, 'bc')").to_df()
    print(res)

    matching = ('abc', 'bbc', 'cbc', 'dbc')
    expected = (pd.DataFrame({'c1': matching, 'c2': matching})
                .astype({'c1': dtype('O'), 'c2': dtype('O')}))
    pd.testing.assert_frame_equal(res, expected)

    res = db_obj.drop_table(table_name)
    assert res.error_code == ErrorCode.OK
2 changes: 2 additions & 0 deletions src/function/builtin_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ import substring;
import substract;
import char_length;
import md5;
import regex;
import default_values;
import special_function;
import internal_types;
Expand Down Expand Up @@ -117,6 +118,7 @@ void BuiltinFunctions::RegisterScalarFunction() {
RegisterSubstringFunction(catalog_ptr_);
RegisterCharLengthFunction(catalog_ptr_);
RegisterMd5Function(catalog_ptr_);
RegisterRegexFunction(catalog_ptr_);
}

void BuiltinFunctions::RegisterTableFunction() {}
Expand Down
54 changes: 54 additions & 0 deletions src/function/scalar/regex.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
module;

#include <re2/re2.h>

module regex;

import stl;
import catalog;
import status;
import infinity_exception;
import scalar_function;
import scalar_function_set;

import third_party;
import logical_type;
import internal_types;
import data_type;
import logger;
import column_vector;

namespace infinity {

struct RegexFunction {
template <typename TA, typename TB, typename TC>
static inline void Run(TA &left, TB &right, TC &result) {
const char * origin_str;
SizeT origin_len;
const char * pattern_str;
SizeT pattern_len;
GetReaderValue(left, origin_str, origin_len);
GetReaderValue(right, pattern_str, pattern_len);
String origin_str_(origin_str, origin_len);
String pattern_str_(pattern_str, pattern_len);
bool match = re2::RE2::PartialMatch(origin_str_, pattern_str_);
result.SetValue(match);
}
};


void RegisterRegexFunction(const UniquePtr<Catalog> &catalog_ptr){
String func_name = "regex";

SharedPtr<ScalarFunctionSet> function_set_ptr = MakeShared<ScalarFunctionSet>(func_name);

ScalarFunction Regex_function(func_name,
{DataType(LogicalType::kVarchar), DataType(LogicalType::kVarchar)},
DataType(LogicalType::kBoolean),
&ScalarFunction::BinaryFunction<VarcharT, VarcharT, BooleanT, RegexFunction>);
function_set_ptr->AddFunction(Regex_function);

Catalog::AddFunctionSet(catalog_ptr.get(), function_set_ptr);
}

}
13 changes: 13 additions & 0 deletions src/function/scalar/regex.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
module;

import stl;

export module regex;

namespace infinity {

// Forward declaration; only a reference to a Catalog pointer is needed here.
class Catalog;

// Registers the "regex" scalar function set (varchar, varchar) -> boolean
// in the given catalog.
export void RegisterRegexFunction(const UniquePtr<Catalog> &catalog_ptr);

}
19 changes: 19 additions & 0 deletions test/sql/dql/type/varchar.slt
Original file line number Diff line number Diff line change
Expand Up @@ -68,5 +68,24 @@ SELECT * FROM test_varchar_filter where md5(c1) = md5('abcdddde');
abcdddde abcddddd 3
abcdddde abcdddde 4

statement ok
INSERT INTO test_varchar_filter VALUES ('[email protected]', '[email protected]', 6);

query X
SELECT * FROM test_varchar_filter where regex(c1, 'abc\w+e');
----
abcdddde abcddddd 3
abcdddde abcdddde 4

query XI
SELECT * FROM test_varchar_filter where regex(c1, 'ddddc');
----
abcddddc abcddddd 2

query XII
SELECT * FROM test_varchar_filter where regex(c1, '(\w+([-+.]\w+)*)@(\w+([-.]\w+)*)\.(\w+([-.]\w+)*)');
----
[email protected] [email protected] 6

statement ok
DROP TABLE test_varchar_filter;
Loading