Skip to content

Commit 62ab352

Browse files
[opt](storage) Add log and metric when aws/azure sdk do retry operation
* Print azure sdk log to `be.INFO` * Set aws_log_level=3 (WARN) and azure_log_level=2 (Warning) by default, because when request failed, we don't know what happened in previously * Set azure MaxRetries equal to config::max_s3_client_retry(10)
1 parent 5676379 commit 62ab352

File tree

10 files changed

+141
-33
lines changed

10 files changed

+141
-33
lines changed

be/src/common/config.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -833,7 +833,18 @@ DEFINE_mInt32(zone_map_row_num_threshold, "20");
833833
// Info = 4,
834834
// Debug = 5,
835835
// Trace = 6
836-
DEFINE_Int32(aws_log_level, "2");
836+
DEFINE_Int32(aws_log_level, "3");
837+
DEFINE_Validator(aws_log_level,
838+
[](const int config) -> bool { return config >= 0 && config <= 6; });
839+
840+
// azure sdk log level
841+
// Verbose = 1,
842+
// Informational = 2,
843+
// Warning = 3,
844+
// Error = 4
845+
DEFINE_Int32(azure_log_level, "3");
846+
DEFINE_Validator(azure_log_level,
847+
[](const int config) -> bool { return config >= 1 && config <= 4; });
837848

838849
// the buffer size when read data from remote storage like s3
839850
DEFINE_mInt32(remote_storage_read_buffer_mb, "16");

be/src/common/config.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -877,6 +877,13 @@ DECLARE_mInt32(zone_map_row_num_threshold);
877877
// Trace = 6
878878
DECLARE_Int32(aws_log_level);
879879

880+
// azure sdk log level
881+
// Verbose = 1,
882+
// Informational = 2,
883+
// Warning = 3,
884+
// Error = 4
885+
DECLARE_Int32(azure_log_level);
886+
880887
// the buffer size when read data from remote storage like s3
881888
DECLARE_mInt32(remote_storage_read_buffer_mb);
882889

be/src/util/s3_util.cpp

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333
#include <atomic>
3434
#ifdef USE_AZURE
35+
#include <azure/core/diagnostics/logger.hpp>
3536
#include <azure/storage/blobs/blob_container_client.hpp>
3637
#endif
3738
#include <cstdlib>
@@ -150,6 +151,33 @@ S3ClientFactory::S3ClientFactory() {
150151
config::s3_put_token_per_second, config::s3_put_bucket_tokens,
151152
config::s3_put_token_limit,
152153
metric_func_factory(put_rate_limit_ns, put_rate_limit_exceed_req_num))};
154+
155+
#ifdef USE_AZURE
156+
auto azureLogLevel =
157+
static_cast<Azure::Core::Diagnostics::Logger::Level>(config::azure_log_level);
158+
Azure::Core::Diagnostics::Logger::SetLevel(azureLogLevel);
159+
Azure::Core::Diagnostics::Logger::SetListener(
160+
[&](Azure::Core::Diagnostics::Logger::Level level, const std::string& message) {
161+
switch (level) {
162+
case Azure::Core::Diagnostics::Logger::Level::Verbose:
163+
LOG(INFO) << message;
164+
break;
165+
case Azure::Core::Diagnostics::Logger::Level::Informational:
166+
LOG(INFO) << message;
167+
break;
168+
case Azure::Core::Diagnostics::Logger::Level::Warning:
169+
LOG(WARNING) << message;
170+
break;
171+
case Azure::Core::Diagnostics::Logger::Level::Error:
172+
LOG(ERROR) << message;
173+
break;
174+
default:
175+
LOG(WARNING) << "Unknown level: " << static_cast<int>(level)
176+
<< ", message: " << message;
177+
break;
178+
}
179+
});
180+
#endif
153181
}
154182

155183
S3ClientFactory::~S3ClientFactory() {
@@ -204,7 +232,13 @@ std::shared_ptr<io::ObjStorageClient> S3ClientFactory::_create_azure_client(
204232
}
205233
}
206234

207-
auto containerClient = std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(uri, cred);
235+
Azure::Storage::Blobs::BlobClientOptions options;
236+
options.Retry.StatusCodes.insert(Azure::Core::Http::HttpStatusCode::TooManyRequests);
237+
options.Retry.MaxRetries = config::max_s3_client_retry;
238+
options.PerRetryPolicies.emplace_back(std::make_unique<AzureRetryRecordPolicy>());
239+
240+
auto containerClient = std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(
241+
uri, cred, std::move(options));
208242
LOG_INFO("create one azure client with {}", s3_conf.to_string());
209243
return std::make_shared<io::AzureObjStorageClient>(std::move(containerClient));
210244
#else

be/test/io/fs/azure_obj_storage_client_test.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
#include "io/fs/file_system.h"
2323
#include "io/fs/obj_storage_client.h"
24+
#include "util/s3_util.h"
2425

2526
#ifdef USE_AZURE
2627
#include <azure/storage/blobs.hpp>
@@ -49,13 +50,16 @@ class AzureObjStorageClientTest : public testing::Test {
4950
std::string accountKey = std::getenv("AZURE_ACCOUNT_KEY");
5051
std::string containerName = std::getenv("AZURE_CONTAINER_NAME");
5152

52-
auto cred = std::make_shared<Azure::Storage::StorageSharedKeyCredential>(accountName,
53-
accountKey);
54-
const std::string uri =
55-
fmt::format("https://{}.blob.core.windows.net/{}", accountName, containerName);
56-
auto containerClient = std::make_shared<BlobContainerClient>(uri, cred);
57-
AzureObjStorageClientTest::obj_storage_client =
58-
std::make_shared<io::AzureObjStorageClient>(std::move(containerClient));
53+
// Initialize Azure SDK
54+
[[maybe_unused]] auto& s3ClientFactory = S3ClientFactory::instance();
55+
56+
AzureObjStorageClientTest::obj_storage_client = S3ClientFactory::instance().create(
57+
{.endpoint = fmt::format("https://{}.blob.core.windows.net", accountName),
58+
.region = "dummy-region",
59+
.ak = accountName,
60+
.sk = accountKey,
61+
.bucket = containerName,
62+
.provider = io::ObjStorageType::AZURE});
5963
}
6064

6165
void SetUp() override {

cloud/src/common/config.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,17 @@ CONF_mBool(enable_batch_get_mow_tablet_stats_and_meta, "true");
307307
// Info = 4,
308308
// Debug = 5,
309309
// Trace = 6
310-
CONF_Int32(aws_log_level, "2");
310+
CONF_Int32(aws_log_level, "3");
311+
CONF_Validator(aws_log_level, [](const int config) -> bool { return config >= 0 && config <= 6; });
312+
313+
// azure sdk log level
314+
// Verbose = 1,
315+
// Informational = 2,
316+
// Warning = 3,
317+
// Error = 4
318+
CONF_Int32(azure_log_level, "3");
319+
CONF_Validator(azure_log_level,
320+
[](const int config) -> bool { return config >= 1 && config <= 4; });
311321

312322
// ca_cert_file is in this path by default, Normally no modification is required
313323
// ca cert default path is different from different OS

cloud/src/recycler/s3_accessor.cpp

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
#include <algorithm>
3131
#ifdef USE_AZURE
32+
#include <azure/core/diagnostics/logger.hpp>
3233
#include <azure/storage/blobs/blob_container_client.hpp>
3334
#include <azure/storage/common/storage_credential.hpp>
3435
#endif
@@ -120,6 +121,33 @@ class S3Environment {
120121
return std::make_shared<DorisAWSLogger>(logLevel);
121122
};
122123
Aws::InitAPI(aws_options_);
124+
125+
#ifdef USE_AZURE
126+
auto azureLogLevel =
127+
static_cast<Azure::Core::Diagnostics::Logger::Level>(config::azure_log_level);
128+
Azure::Core::Diagnostics::Logger::SetLevel(azureLogLevel);
129+
Azure::Core::Diagnostics::Logger::SetListener(
130+
[&](Azure::Core::Diagnostics::Logger::Level level, const std::string& message) {
131+
switch (level) {
132+
case Azure::Core::Diagnostics::Logger::Level::Verbose:
133+
LOG(INFO) << message;
134+
break;
135+
case Azure::Core::Diagnostics::Logger::Level::Informational:
136+
LOG(INFO) << message;
137+
break;
138+
case Azure::Core::Diagnostics::Logger::Level::Warning:
139+
LOG(WARNING) << message;
140+
break;
141+
case Azure::Core::Diagnostics::Logger::Level::Error:
142+
LOG(ERROR) << message;
143+
break;
144+
default:
145+
LOG(WARNING) << "Unknown level: " << static_cast<int>(level)
146+
<< ", message: " << message;
147+
break;
148+
}
149+
});
150+
#endif
123151
}
124152

125153
~S3Environment() { Aws::ShutdownAPI(aws_options_); }
@@ -308,8 +336,7 @@ int S3Accessor::init() {
308336
// Within the RetryPolicy, the nextPolicy is called multiple times inside a loop.
309337
// All policies in the PerRetryPolicies are downstream of the RetryPolicy.
310338
// Therefore, you only need to add a policy to check if the response code is 429 and if the retry count meets the condition, it can record the retry count.
311-
options.PerRetryPolicies.emplace_back(
312-
std::make_unique<AzureRetryRecordPolicy>(config::max_s3_client_retry));
339+
options.PerRetryPolicies.emplace_back(std::make_unique<AzureRetryRecordPolicy>());
313340
auto container_client = std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(
314341
uri_, cred, std::move(options));
315342
// uri format for debug: ${scheme}://${ak}.blob.core.windows.net/${bucket}/${prefix}

common/cpp/aws_logger.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
#include <aws/core/utils/logging/LogLevel.h>
2121
#include <aws/core/utils/logging/LogSystemInterface.h>
22-
#include <glog/logging.h> // IWYU pragma: export
22+
#include <glog/logging.h>
2323

2424
class DorisAWSLogger final : public Aws::Utils::Logging::LogSystemInterface {
2525
public:

common/cpp/obj_retry_strategy.cpp

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@
1919

2020
#include <aws/core/http/HttpResponse.h>
2121
#include <bvar/reducer.h>
22+
#include <glog/logging.h>
2223

2324
namespace doris {
2425

25-
bvar::Adder<int64_t> s3_too_many_request_retry_cnt("s3_too_many_request_retry_cnt");
26+
bvar::Adder<int64_t> object_request_retry_count("object_request_retry_count");
2627

2728
S3CustomRetryStrategy::S3CustomRetryStrategy(int maxRetries) : DefaultRetryStrategy(maxRetries) {}
2829

@@ -35,33 +36,42 @@ bool S3CustomRetryStrategy::ShouldRetry(const Aws::Client::AWSError<Aws::Client:
3536
}
3637

3738
if (Aws::Http::IsRetryableHttpResponseCode(error.GetResponseCode()) || error.ShouldRetry()) {
38-
s3_too_many_request_retry_cnt << 1;
39+
object_request_retry_count << 1;
40+
LOG(INFO) << "retry due to error: " << error << ", attempt: " << attemptedRetries + 1 << "/"
41+
<< m_maxRetries;
3942
return true;
4043
}
4144

4245
return false;
4346
}
4447
#ifdef USE_AZURE
45-
AzureRetryRecordPolicy::AzureRetryRecordPolicy(int retry_cnt) : retry_cnt(retry_cnt) {}
46-
47-
AzureRetryRecordPolicy::~AzureRetryRecordPolicy() = default;
4848

4949
std::unique_ptr<Azure::Core::Http::RawResponse> AzureRetryRecordPolicy::Send(
5050
Azure::Core::Http::Request& request, Azure::Core::Http::Policies::NextHttpPolicy nextPolicy,
5151
Azure::Core::Context const& context) const {
52-
auto resp = nextPolicy.Send(request, context);
53-
if (retry_cnt != 0 &&
54-
resp->GetStatusCode() == Azure::Core::Http::HttpStatusCode::TooManyRequests) {
55-
retry_cnt--;
56-
s3_too_many_request_retry_cnt << 1;
52+
// https://learn.microsoft.com/en-us/azure/developer/cpp/sdk/fundamentals/http-pipelines-and-retries
53+
54+
std::unique_ptr<Azure::Core::Http::RawResponse> response = nextPolicy.Send(request, context);
55+
int32_t retry_count =
56+
Azure::Core::Http::Policies::_internal::RetryPolicy::GetRetryCount(context);
57+
58+
if (static_cast<int>(response->GetStatusCode()) > 299 ||
59+
static_cast<int>(response->GetStatusCode()) < 200) {
60+
if (retry_count > 0) {
61+
object_request_retry_count << 1;
62+
}
63+
64+
// If the response is not successful, we log the retry attempt and status code.
65+
LOG(INFO) << "azure retry retry_count: " << retry_count
66+
<< ", status code: " << static_cast<int>(response->GetStatusCode())
67+
<< ", reason: " << response->GetReasonPhrase();
5768
}
58-
return resp;
69+
70+
return response;
5971
}
6072

6173
std::unique_ptr<AzureRetryRecordPolicy::HttpPolicy> AzureRetryRecordPolicy::Clone() const {
62-
auto ret = std::make_unique<AzureRetryRecordPolicy>(*this);
63-
ret->retry_cnt = 0;
64-
return ret;
74+
return std::make_unique<AzureRetryRecordPolicy>(*this);
6575
}
6676
#endif
6777
} // namespace doris

common/cpp/obj_retry_strategy.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,14 @@ class S3CustomRetryStrategy final : public Aws::Client::DefaultRetryStrategy {
3737
#ifdef USE_AZURE
3838
class AzureRetryRecordPolicy final : public Azure::Core::Http::Policies::HttpPolicy {
3939
public:
40-
AzureRetryRecordPolicy(int retry_cnt);
41-
~AzureRetryRecordPolicy() override;
40+
AzureRetryRecordPolicy() = default;
41+
~AzureRetryRecordPolicy() override = default;
42+
4243
std::unique_ptr<HttpPolicy> Clone() const override;
4344
std::unique_ptr<Azure::Core::Http::RawResponse> Send(
4445
Azure::Core::Http::Request& request,
4546
Azure::Core::Http::Policies::NextHttpPolicy nextPolicy,
4647
Azure::Core::Context const& context) const override;
47-
48-
private:
49-
mutable int retry_cnt;
5048
};
5149
#endif
5250
} // namespace doris

conf/be.conf

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ sys_log_level = INFO
8585
# Debug = 5,
8686
# Trace = 6
8787
# Default to turn off aws sdk log, because aws sdk errors that need to be cared will be output through Doris logs
88-
aws_log_level=2
88+
aws_log_level = 3
89+
90+
# azure sdk log level
91+
# Verbose = 1,
92+
# Informational = 2,
93+
# Warning = 3,
94+
# Error = 4
95+
azure_log_level = 3
8996
## If you are not running in aws cloud, you can disable EC2 metadata
9097
AWS_EC2_METADATA_DISABLED=true

0 commit comments

Comments
 (0)