Skip to content

Commit 7d6fd7a

Browse files
authored
chore: update benchmark logic to respect allow_large_results=False (#1545)
* chore: update benchmark logic for 2.0 * update benchmark logic * fix
1 parent 4a7fe4d commit 7d6fd7a

File tree

3 files changed

+105
-64
lines changed

3 files changed

+105
-64
lines changed

bigframes/session/_io/bigquery/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ def start_query_with_client(
247247
api_timeout=timeout,
248248
)
249249
if metrics is not None:
250-
metrics.count_job_stats()
250+
metrics.count_job_stats(query=sql)
251251
return results_iterator, None
252252

253253
query_job = bq_client.query(

bigframes/session/metrics.py

+37-23
Original file line numberDiff line numberDiff line change
@@ -32,29 +32,35 @@ class ExecutionMetrics:
3232
execution_secs: float = 0
3333
query_char_count: int = 0
3434

35-
def count_job_stats(self, query_job: Optional[bq_job.QueryJob] = None):
35+
def count_job_stats(
36+
self, query_job: Optional[bq_job.QueryJob] = None, query: str = ""
37+
):
3638
if query_job is None:
39+
query_char_count = len(query)
3740
self.execution_count += 1
41+
self.query_char_count += query_char_count
42+
if LOGGING_NAME_ENV_VAR in os.environ:
43+
write_stats_to_disk(query_char_count)
3844
return
3945

4046
stats = get_performance_stats(query_job)
4147
if stats is not None:
42-
bytes_processed, slot_millis, execution_secs, query_char_count = stats
48+
query_char_count, bytes_processed, slot_millis, execution_secs = stats
4349
self.execution_count += 1
50+
self.query_char_count += query_char_count
4451
self.bytes_processed += bytes_processed
4552
self.slot_millis += slot_millis
4653
self.execution_secs += execution_secs
47-
self.query_char_count += query_char_count
4854
if LOGGING_NAME_ENV_VAR in os.environ:
4955
# when running notebooks via pytest nbmake
5056
write_stats_to_disk(
51-
bytes_processed, slot_millis, execution_secs, query_char_count
57+
query_char_count, bytes_processed, slot_millis, execution_secs
5258
)
5359

5460

5561
def get_performance_stats(
5662
query_job: bigquery.QueryJob,
57-
) -> Optional[Tuple[int, int, float, int]]:
63+
) -> Optional[Tuple[int, int, int, float]]:
5864
"""Parse the query job for performance stats.
5965
6066
Return None if the stats do not reflect real work done in bigquery.
@@ -77,11 +83,14 @@ def get_performance_stats(
7783
execution_secs = (query_job.ended - query_job.created).total_seconds()
7884
query_char_count = len(query_job.query)
7985

80-
return bytes_processed, slot_millis, execution_secs, query_char_count
86+
return query_char_count, bytes_processed, slot_millis, execution_secs
8187

8288

8389
def write_stats_to_disk(
84-
bytes_processed: int, slot_millis: int, exec_seconds: float, query_char_count: int
90+
query_char_count: int,
91+
bytes_processed: Optional[int] = None,
92+
slot_millis: Optional[int] = None,
93+
exec_seconds: Optional[float] = None,
8594
):
8695
"""For pytest runs only, log information about the query job
8796
to a file in order to create a performance report.
@@ -95,22 +104,27 @@ def write_stats_to_disk(
95104
test_name = os.environ[LOGGING_NAME_ENV_VAR]
96105
current_directory = os.getcwd()
97106

98-
# store bytes processed
99-
bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed")
100-
with open(bytes_file, "a") as f:
101-
f.write(str(bytes_processed) + "\n")
102-
103-
# store slot milliseconds
104-
slot_file = os.path.join(current_directory, test_name + ".slotmillis")
105-
with open(slot_file, "a") as f:
106-
f.write(str(slot_millis) + "\n")
107-
108-
# store execution time seconds
109-
exec_time_file = os.path.join(
110-
current_directory, test_name + ".bq_exec_time_seconds"
111-
)
112-
with open(exec_time_file, "a") as f:
113-
f.write(str(exec_seconds) + "\n")
107+
if (
108+
(bytes_processed is not None)
109+
and (slot_millis is not None)
110+
and (exec_seconds is not None)
111+
):
112+
# store bytes processed
113+
bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed")
114+
with open(bytes_file, "a") as f:
115+
f.write(str(bytes_processed) + "\n")
116+
117+
# store slot milliseconds
118+
slot_file = os.path.join(current_directory, test_name + ".slotmillis")
119+
with open(slot_file, "a") as f:
120+
f.write(str(slot_millis) + "\n")
121+
122+
# store execution time seconds
123+
exec_time_file = os.path.join(
124+
current_directory, test_name + ".bq_exec_time_seconds"
125+
)
126+
with open(exec_time_file, "a") as f:
127+
f.write(str(exec_seconds) + "\n")
114128

115129
# store length of query
116130
query_char_count_file = os.path.join(

scripts/run_and_publish_benchmark.py

+67-40
Original file line numberDiff line numberDiff line change
@@ -95,55 +95,62 @@ def collect_benchmark_result(
9595
if not (
9696
len(bytes_files)
9797
== len(millis_files)
98-
== len(local_seconds_files)
9998
== len(bq_seconds_files)
100-
== len(query_char_count_files)
99+
<= len(query_char_count_files)
100+
== len(local_seconds_files)
101101
):
102102
raise ValueError(
103103
"Mismatch in the number of report files for bytes, millis, seconds and query char count."
104104
)
105105

106-
for idx in range(len(bytes_files)):
107-
bytes_file = bytes_files[idx]
108-
millis_file = millis_files[idx]
109-
bq_seconds_file = bq_seconds_files[idx]
110-
query_char_count_file = query_char_count_files[idx]
111-
112-
filename = bytes_file.relative_to(path).with_suffix("")
113-
114-
if filename != millis_file.relative_to(path).with_suffix(
115-
""
116-
) or filename != bq_seconds_file.relative_to(path).with_suffix(""):
117-
raise ValueError(
118-
"File name mismatch among bytes, millis, and seconds reports."
119-
)
106+
has_full_metrics = len(bq_seconds_files) == len(local_seconds_files)
120107

108+
for idx in range(len(local_seconds_files)):
109+
query_char_count_file = query_char_count_files[idx]
121110
local_seconds_file = local_seconds_files[idx]
111+
filename = query_char_count_file.relative_to(path).with_suffix("")
122112
if filename != local_seconds_file.relative_to(path).with_suffix(""):
123113
raise ValueError(
124-
"File name mismatch among bytes, millis, and seconds reports."
114+
"File name mismatch between query_char_count and seconds reports."
125115
)
126116

127-
with open(bytes_file, "r") as file:
117+
with open(query_char_count_file, "r") as file:
128118
lines = file.read().splitlines()
119+
query_char_count = sum(int(line) for line in lines) / iterations
129120
query_count = len(lines) / iterations
130-
total_bytes = sum(int(line) for line in lines) / iterations
131-
132-
with open(millis_file, "r") as file:
133-
lines = file.read().splitlines()
134-
total_slot_millis = sum(int(line) for line in lines) / iterations
135121

136122
with open(local_seconds_file, "r") as file:
137123
lines = file.read().splitlines()
138124
local_seconds = sum(float(line) for line in lines) / iterations
139125

140-
with open(bq_seconds_file, "r") as file:
141-
lines = file.read().splitlines()
142-
bq_seconds = sum(float(line) for line in lines) / iterations
126+
if not has_full_metrics:
127+
total_bytes = None
128+
total_slot_millis = None
129+
bq_seconds = None
130+
else:
131+
bytes_file = bytes_files[idx]
132+
millis_file = millis_files[idx]
133+
bq_seconds_file = bq_seconds_files[idx]
134+
if (
135+
filename != bytes_file.relative_to(path).with_suffix("")
136+
or filename != millis_file.relative_to(path).with_suffix("")
137+
or filename != bq_seconds_file.relative_to(path).with_suffix("")
138+
):
139+
raise ValueError(
140+
"File name mismatch among query_char_count, bytes, millis, and seconds reports."
141+
)
143142

144-
with open(query_char_count_file, "r") as file:
145-
lines = file.read().splitlines()
146-
query_char_count = sum(int(line) for line in lines) / iterations
143+
with open(bytes_file, "r") as file:
144+
lines = file.read().splitlines()
145+
total_bytes = sum(int(line) for line in lines) / iterations
146+
147+
with open(millis_file, "r") as file:
148+
lines = file.read().splitlines()
149+
total_slot_millis = sum(int(line) for line in lines) / iterations
150+
151+
with open(bq_seconds_file, "r") as file:
152+
lines = file.read().splitlines()
153+
bq_seconds = sum(float(line) for line in lines) / iterations
147154

148155
results_dict[str(filename)] = [
149156
query_count,
@@ -194,11 +201,19 @@ def collect_benchmark_result(
194201
)
195202
print(
196203
f"{index} - query count: {row['Query_Count']},"
197-
f" query char count: {row['Query_Char_Count']},",
198-
f" bytes processed sum: {row['Bytes_Processed']},"
199-
f" slot millis sum: {row['Slot_Millis']},"
200-
f" local execution time: {formatted_local_exec_time} seconds,"
201-
f" bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds",
204+
+ f" query char count: {row['Query_Char_Count']},"
205+
+ (
206+
f" bytes processed sum: {row['Bytes_Processed']},"
207+
if has_full_metrics
208+
else ""
209+
)
210+
+ (f" slot millis sum: {row['Slot_Millis']}," if has_full_metrics else "")
211+
+ f" local execution time: {formatted_local_exec_time} seconds"
212+
+ (
213+
f", bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds"
214+
if has_full_metrics
215+
else ""
216+
)
202217
)
203218

204219
geometric_mean_queries = geometric_mean_excluding_zeros(
@@ -221,12 +236,24 @@ def collect_benchmark_result(
221236
)
222237

223238
print(
224-
f"---Geometric mean of queries: {geometric_mean_queries}, "
225-
f"Geometric mean of queries char counts: {geometric_mean_query_char_count}, "
226-
f"Geometric mean of bytes processed: {geometric_mean_bytes}, "
227-
f"Geometric mean of slot millis: {geometric_mean_slot_millis}, "
228-
f"Geometric mean of local execution time: {geometric_mean_local_seconds} seconds, "
229-
f"Geometric mean of BigQuery execution time: {geometric_mean_bq_seconds} seconds---"
239+
f"---Geometric mean of queries: {geometric_mean_queries},"
240+
+ f" Geometric mean of queries char counts: {geometric_mean_query_char_count},"
241+
+ (
242+
f" Geometric mean of bytes processed: {geometric_mean_bytes},"
243+
if has_full_metrics
244+
else ""
245+
)
246+
+ (
247+
f" Geometric mean of slot millis: {geometric_mean_slot_millis},"
248+
if has_full_metrics
249+
else ""
250+
)
251+
+ f" Geometric mean of local execution time: {geometric_mean_local_seconds} seconds"
252+
+ (
253+
f", Geometric mean of BigQuery execution time: {geometric_mean_bq_seconds} seconds---"
254+
if has_full_metrics
255+
else ""
256+
)
230257
)
231258

232259
error_message = (

0 commit comments

Comments
 (0)