Skip to content

Commit fa40c6d

Browse files
authored
Merge pull request #2 from ties/feature/more-schemes-supported
Support more schemes
2 parents 30d7602 + 569f27c commit fa40c6d

12 files changed

+70
-12
lines changed

src/functions/extract_domain.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ namespace duckdb
3535
Connection con (db);
3636

3737
// Extract the host from the URL
38-
std::regex host_regex (R"(^(?:https?:\/\/)?([^\/\?:]+))");
38+
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))");
3939
std::smatch host_match;
4040
if (!std::regex_search (input, host_match, host_regex))
4141
{

src/functions/extract_host.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,13 @@ namespace duckdb
2424
{
2525
// Regex to match the host component of a URL
2626
// Explanation:
27-
// ^ - Start of the string
28-
// (?: - Non-capturing group for the optional protocol
29-
// https?:\/\/ - Matches "http://" or "https://"
27+
// ^ - Start of the string
28+
// (?: - Non-capturing group for the optional protocol
29+
// (?:ftp|https?|rsync) - Non-capturing group for the scheme
30+
// :\/\/ - Matches "://"
3031
// )?
31-
// ([^\/\s:?#]+) - Capturing group for the host (any characters except '/', ':', '?', '#', or whitespace)
32-
std::regex host_regex (R"(^(?:https?:\/\/)?([^\/\s:?#]+))");
32+
// ([^\/\s:?#]+) - Capturing group for the host (any characters except '/', ':', '?', '#', or whitespace)
33+
std::regex host_regex (R"(^(?:(?:ftp|https?|rsync):\/\/)?([^\/\s:?#]+))");
3334
std::smatch host_match;
3435

3536
// Use regex_search to find the host component in the input string

src/functions/extract_path.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ namespace duckdb
2626
// Explanation:
2727
// ^ - Start of the string
2828
// (?: - Non-capturing group for the protocol and domain part
29-
// (?:https?:\/\/)? - Optional http:// or https://
29+
// (?:(?:ftp|https?|rsync):\/\/)? - Optional ftp://, http://, https://, or rsync://
3030
// (?:[^\/\s]+) - Host and optional port (any characters except '/' or whitespace)
3131
// )
3232
// (\/[^?#]*) - Capturing group for the path (starts with '/', followed by any characters except '?' or '#')
33-
std::regex path_regex (R"(^(?:(?:https?:\/\/)?(?:[^\/\s]+))(\/[^?#]*))");
33+
std::regex path_regex (R"(^(?:(?:(?:ftp|https?|rsync):\/\/)?(?:[^\/\s]+))(\/[^?#]*))");
3434
std::smatch path_match;
3535

3636
// Use regex_search to find the path component in the input string

src/functions/extract_schema.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ namespace duckdb
3030
// | - OR
3131
// (mailto|sms|tel) - Capturing group for other protocols (mailto, sms, tel)
3232
// :[^/] - Matches ":" followed by any character except "/"
33-
std::regex schema_regex (R"(^(http|https|ftp):\/\/|^(mailto|sms|tel):[^/])");
33+
std::regex schema_regex (R"(^(http|https|ftp|rsync):\/\/|^(mailto|sms|tel):[^/])");
3434
std::smatch schema_match;
3535

3636
// Use regex_search to find the schema component in the input string

src/functions/extract_subdomain.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ namespace duckdb
2929
Connection con (db);
3030

3131
// Extract the host from the URL
32-
std::regex host_regex (R"(^(?:https?:\/\/)?([^\/\?:]+))");
32+
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))");
3333
std::smatch host_match;
3434
if (!std::regex_search (input, host_match, host_regex))
3535
{

src/functions/extract_tld.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ namespace duckdb
3030
Connection con (db);
3131

3232
// Extract the host from the URL
33-
std::regex host_regex (R"(^(?:https?:\/\/)?([^\/\?:]+))");
33+
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))");
3434
std::smatch host_match;
3535
if (!std::regex_search (input, host_match, host_regex))
3636
{

test/sql/extract_domain.test

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,16 @@ SELECT extract_domain('com:443');
144144
----
145145
com
146146

147+
query I
148+
SELECT extract_domain('ftp://example.com/path');
149+
----
150+
example.com
151+
152+
query I
153+
SELECT extract_domain('rsync://rpki.example.com/path');
154+
----
155+
example.com
156+
147157
query I
148158
SELECT extract_domain('c');
149159
----
@@ -162,4 +172,4 @@ SELECT extract_domain('http:/example.com.ac/path');
162172
query I
163173
SELECT extract_domain('http:/example.com.ac:443/path');
164174
----
165-
(empty)
175+
(empty)

test/sql/extract_host.test

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,13 @@ query I
5353
SELECT extract_host('example.com.ac:443/path/path');
5454
----
5555
example.com.ac
56+
57+
query I
58+
SELECT extract_host('ftp://ftp.example.com/path');
59+
----
60+
ftp.example.com
61+
62+
query I
63+
SELECT extract_host('rsync://rpki.example.com/path');
64+
----
65+
rpki.example.com

test/sql/extract_path.test

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,13 @@ query I
5353
SELECT extract_path('example.com.ac:443/path/path');
5454
----
5555
/path/path
56+
57+
query I
58+
SELECT extract_path('ftp://ftp.exmple.com/path/path');
59+
----
60+
/path/path
61+
62+
query I
63+
SELECT extract_path('rsync://rpki.exmple.com/path/path');
64+
----
65+
/path/path

test/sql/extract_schema.test

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,13 @@ SELECT extract_schema('ftp:/example.com');
9595
----
9696
(empty)
9797

98+
# ---------------------------- RSYNC ----------------------------
99+
100+
query I
101+
SELECT extract_schema('rsync://rpki.example.com');
102+
----
103+
rsync
104+
98105
# ---------------------------- TEL ----------------------------
99106

100107
query I

test/sql/extract_subdomain.test

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,16 @@ SELECT extract_subdomain('https://b.a.example.com.ac:443/path');
7474
----
7575
b.a
7676

77+
query I
78+
SELECT extract_subdomain('rsync://rpki.example.com/path');
79+
----
80+
rpki
81+
82+
query I
83+
SELECT extract_subdomain('ftp://ftp1.example.com/path');
84+
----
85+
ftp1
86+
7787
query I
7888
SELECT extract_subdomain('com.ac');
7989
----

test/sql/extract_tld.test

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,16 @@ SELECT extract_tld('https://example.com.ac:443/path/path');
124124
----
125125
com.ac
126126

127+
query I
128+
SELECT extract_tld('ftp://ftp.example.com/path/path');
129+
----
130+
com
131+
132+
query I
133+
SELECT extract_tld('rsync://rpki.example.com/path/path');
134+
----
135+
com
136+
127137

128138
query I
129139
SELECT extract_tld('com.ac');

0 commit comments

Comments
 (0)