Skip to content

Commit 69e0626

Browse files
authored
feat: add get_cc_select_html (#496)
1 parent f5502b4 commit 69e0626

File tree

6 files changed

+156
-357
lines changed

6 files changed

+156
-357
lines changed

jupyter/main-html-dedup/cc_dedup_fir.ipynb

Lines changed: 4 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,7 @@
44
"cell_type": "code",
55
"execution_count": null,
66
"id": "0",
7-
"metadata": {
8-
"execution": {
9-
"iopub.execute_input": "2025-07-11T02:24:06.149493Z",
10-
"iopub.status.busy": "2025-07-11T02:24:06.149207Z",
11-
"iopub.status.idle": "2025-07-11T02:24:14.378413Z",
12-
"shell.execute_reply": "2025-07-11T02:24:14.377755Z",
13-
"shell.execute_reply.started": "2025-07-11T02:24:06.149476Z"
14-
}
15-
},
7+
"metadata": {},
168
"outputs": [],
179
"source": [
1810
"from pyspark.sql import Row\n",
@@ -57,15 +49,7 @@
5749
"cell_type": "code",
5850
"execution_count": null,
5951
"id": "2",
60-
"metadata": {
61-
"execution": {
62-
"iopub.execute_input": "2025-07-11T02:24:14.379728Z",
63-
"iopub.status.busy": "2025-07-11T02:24:14.379322Z",
64-
"iopub.status.idle": "2025-07-11T02:24:14.382394Z",
65-
"shell.execute_reply": "2025-07-11T02:24:14.381923Z",
66-
"shell.execute_reply.started": "2025-07-11T02:24:14.379710Z"
67-
}
68-
},
52+
"metadata": {},
6953
"outputs": [],
7054
"source": [
7155
"# 获取 cc warc path list\n",
@@ -81,13 +65,6 @@
8165
"id": "3",
8266
"metadata": {
8367
"editable": true,
84-
"execution": {
85-
"iopub.execute_input": "2025-07-11T02:24:14.383374Z",
86-
"iopub.status.busy": "2025-07-11T02:24:14.383061Z",
87-
"iopub.status.idle": "2025-07-11T02:24:14.397134Z",
88-
"shell.execute_reply": "2025-07-11T02:24:14.396661Z",
89-
"shell.execute_reply.started": "2025-07-11T02:24:14.383359Z"
90-
},
9168
"slideshow": {
9269
"slide_type": ""
9370
},
@@ -185,15 +162,7 @@
185162
"cell_type": "code",
186163
"execution_count": null,
187164
"id": "4",
188-
"metadata": {
189-
"execution": {
190-
"iopub.execute_input": "2025-07-11T02:24:14.397836Z",
191-
"iopub.status.busy": "2025-07-11T02:24:14.397698Z",
192-
"iopub.status.idle": "2025-07-11T02:24:45.430826Z",
193-
"shell.execute_reply": "2025-07-11T02:24:45.429906Z",
194-
"shell.execute_reply.started": "2025-07-11T02:24:14.397823Z"
195-
}
196-
},
165+
"metadata": {},
197166
"outputs": [],
198167
"source": [
199168
"# mapPartitions 对 warc path 并行解析数据\n",
@@ -217,15 +186,7 @@
217186
"cell_type": "code",
218187
"execution_count": null,
219188
"id": "6",
220-
"metadata": {
221-
"execution": {
222-
"iopub.execute_input": "2025-07-11T02:24:52.435257Z",
223-
"iopub.status.busy": "2025-07-11T02:24:52.434949Z",
224-
"iopub.status.idle": "2025-07-11T02:26:00.233117Z",
225-
"shell.execute_reply": "2025-07-11T02:26:00.232529Z",
226-
"shell.execute_reply.started": "2025-07-11T02:24:52.435240Z"
227-
}
228-
},
189+
"metadata": {},
229190
"outputs": [],
230191
"source": [
231192
"config[\"skip_output_version\"] = True\n",

jupyter/main-html-dedup/cc_dedup_sec.ipynb

Lines changed: 21 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,7 @@
44
"cell_type": "code",
55
"execution_count": null,
66
"id": "0",
7-
"metadata": {
8-
"execution": {
9-
"iopub.status.idle": "2025-07-11T02:26:43.748017Z",
10-
"shell.execute_reply": "2025-07-11T02:26:43.747366Z",
11-
"shell.execute_reply.started": "2025-07-11T02:26:34.645411Z"
12-
}
13-
},
7+
"metadata": {},
148
"outputs": [],
159
"source": [
1610
"from xinghe.spark import *\n",
@@ -50,15 +44,7 @@
5044
"cell_type": "code",
5145
"execution_count": null,
5246
"id": "2",
53-
"metadata": {
54-
"execution": {
55-
"iopub.execute_input": "2025-07-11T02:26:43.749268Z",
56-
"iopub.status.busy": "2025-07-11T02:26:43.748929Z",
57-
"iopub.status.idle": "2025-07-11T02:26:43.751756Z",
58-
"shell.execute_reply": "2025-07-11T02:26:43.751304Z",
59-
"shell.execute_reply.started": "2025-07-11T02:26:43.749250Z"
60-
}
61-
},
47+
"metadata": {},
6248
"outputs": [],
6349
"source": [
6450
"#dump_paths = []\n",
@@ -71,15 +57,7 @@
7157
"cell_type": "code",
7258
"execution_count": null,
7359
"id": "3",
74-
"metadata": {
75-
"execution": {
76-
"iopub.execute_input": "2025-07-11T02:26:43.752632Z",
77-
"iopub.status.busy": "2025-07-11T02:26:43.752346Z",
78-
"iopub.status.idle": "2025-07-11T02:26:43.762655Z",
79-
"shell.execute_reply": "2025-07-11T02:26:43.762229Z",
80-
"shell.execute_reply.started": "2025-07-11T02:26:43.752617Z"
81-
}
82-
},
60+
"metadata": {},
8361
"outputs": [],
8462
"source": [
8563
"#input_df = spark.read.format(\"json\").load(dump_paths)"
@@ -97,15 +75,7 @@
9775
"cell_type": "code",
9876
"execution_count": null,
9977
"id": "5",
100-
"metadata": {
101-
"execution": {
102-
"iopub.execute_input": "2025-07-11T02:26:43.763518Z",
103-
"iopub.status.busy": "2025-07-11T02:26:43.763240Z",
104-
"iopub.status.idle": "2025-07-11T02:26:43.770017Z",
105-
"shell.execute_reply": "2025-07-11T02:26:43.769479Z",
106-
"shell.execute_reply.started": "2025-07-11T02:26:43.763503Z"
107-
}
108-
},
78+
"metadata": {},
10979
"outputs": [],
11080
"source": [
11181
"#already_exist_id_v_df = read_any_path(spark, already_exist_id_path, config)"
@@ -115,15 +85,7 @@
11585
"cell_type": "code",
11686
"execution_count": null,
11787
"id": "6",
118-
"metadata": {
119-
"execution": {
120-
"iopub.execute_input": "2025-07-11T02:26:43.772007Z",
121-
"iopub.status.busy": "2025-07-11T02:26:43.771594Z",
122-
"iopub.status.idle": "2025-07-11T02:26:43.776268Z",
123-
"shell.execute_reply": "2025-07-11T02:26:43.775843Z",
124-
"shell.execute_reply.started": "2025-07-11T02:26:43.771991Z"
125-
}
126-
},
88+
"metadata": {},
12789
"outputs": [],
12890
"source": [
12991
"# 定义 Schema\n",
@@ -147,15 +109,7 @@
147109
"cell_type": "code",
148110
"execution_count": null,
149111
"id": "8",
150-
"metadata": {
151-
"execution": {
152-
"iopub.execute_input": "2025-07-11T02:26:43.776944Z",
153-
"iopub.status.busy": "2025-07-11T02:26:43.776805Z",
154-
"iopub.status.idle": "2025-07-11T02:26:43.784766Z",
155-
"shell.execute_reply": "2025-07-11T02:26:43.784336Z",
156-
"shell.execute_reply.started": "2025-07-11T02:26:43.776931Z"
157-
}
158-
},
112+
"metadata": {},
159113
"outputs": [],
160114
"source": [
161115
"#join_df = input_df.join(already_exist_id_df, on=\"hash_html\", how=\"left\")\n",
@@ -173,16 +127,8 @@
173127
{
174128
"cell_type": "code",
175129
"execution_count": null,
176-
"id": "da837b3e-5eee-43b1-9c77-836aac1203df",
177-
"metadata": {
178-
"execution": {
179-
"iopub.execute_input": "2025-07-11T02:26:43.785611Z",
180-
"iopub.status.busy": "2025-07-11T02:26:43.785331Z",
181-
"iopub.status.idle": "2025-07-11T02:27:13.606489Z",
182-
"shell.execute_reply": "2025-07-11T02:27:13.605744Z",
183-
"shell.execute_reply.started": "2025-07-11T02:26:43.785596Z"
184-
}
185-
},
130+
"id": "10",
131+
"metadata": {},
186132
"outputs": [],
187133
"source": [
188134
"input_df = read_any_path(spark, \",\".join(base_input_path), config)"
@@ -191,16 +137,8 @@
191137
{
192138
"cell_type": "code",
193139
"execution_count": null,
194-
"id": "20135ae2-b1d0-43c9-b241-8d23de94f01d",
195-
"metadata": {
196-
"execution": {
197-
"iopub.execute_input": "2025-07-11T02:27:13.608658Z",
198-
"iopub.status.busy": "2025-07-11T02:27:13.607902Z",
199-
"iopub.status.idle": "2025-07-11T02:27:13.612633Z",
200-
"shell.execute_reply": "2025-07-11T02:27:13.611707Z",
201-
"shell.execute_reply.started": "2025-07-11T02:27:13.608633Z"
202-
}
203-
},
140+
"id": "11",
141+
"metadata": {},
204142
"outputs": [],
205143
"source": [
206144
"def json_data(row_iter)->Row:\n",
@@ -212,16 +150,8 @@
212150
{
213151
"cell_type": "code",
214152
"execution_count": null,
215-
"id": "0c0435c3-bd5e-4571-ab29-90c522023dec",
216-
"metadata": {
217-
"execution": {
218-
"iopub.execute_input": "2025-07-11T02:27:13.614260Z",
219-
"iopub.status.busy": "2025-07-11T02:27:13.613866Z",
220-
"iopub.status.idle": "2025-07-11T02:27:17.465595Z",
221-
"shell.execute_reply": "2025-07-11T02:27:17.464788Z",
222-
"shell.execute_reply.started": "2025-07-11T02:27:13.614237Z"
223-
}
224-
},
153+
"id": "12",
154+
"metadata": {},
225155
"outputs": [],
226156
"source": [
227157
"undedup_id_df = input_df.rdd.mapPartitions(json_data).toDF()"
@@ -230,16 +160,8 @@
230160
{
231161
"cell_type": "code",
232162
"execution_count": null,
233-
"id": "10",
234-
"metadata": {
235-
"execution": {
236-
"iopub.execute_input": "2025-07-11T02:27:17.466995Z",
237-
"iopub.status.busy": "2025-07-11T02:27:17.466683Z",
238-
"iopub.status.idle": "2025-07-11T02:27:17.526987Z",
239-
"shell.execute_reply": "2025-07-11T02:27:17.526167Z",
240-
"shell.execute_reply.started": "2025-07-11T02:27:17.466977Z"
241-
}
242-
},
163+
"id": "13",
164+
"metadata": {},
243165
"outputs": [],
244166
"source": [
245167
"def deduplicate_partition(partition):\n",
@@ -251,24 +173,16 @@
251173
{
252174
"cell_type": "code",
253175
"execution_count": null,
254-
"id": "11",
255-
"metadata": {
256-
"execution": {
257-
"iopub.execute_input": "2025-07-11T02:27:17.528440Z",
258-
"iopub.status.busy": "2025-07-11T02:27:17.528025Z",
259-
"iopub.status.idle": "2025-07-11T02:27:17.537970Z",
260-
"shell.execute_reply": "2025-07-11T02:27:17.537362Z",
261-
"shell.execute_reply.started": "2025-07-11T02:27:17.528418Z"
262-
}
263-
},
176+
"id": "14",
177+
"metadata": {},
264178
"outputs": [],
265179
"source": [
266180
"dedup_df = dedup_part_df.dropDuplicates([\"hash_html\"])"
267181
]
268182
},
269183
{
270184
"cell_type": "markdown",
271-
"id": "12",
185+
"id": "15",
272186
"metadata": {},
273187
"source": [
274188
"# 写出s3"
@@ -277,16 +191,8 @@
277191
{
278192
"cell_type": "code",
279193
"execution_count": null,
280-
"id": "13",
281-
"metadata": {
282-
"execution": {
283-
"iopub.execute_input": "2025-07-11T02:27:17.539280Z",
284-
"iopub.status.busy": "2025-07-11T02:27:17.539112Z",
285-
"iopub.status.idle": "2025-07-11T02:27:17.581417Z",
286-
"shell.execute_reply": "2025-07-11T02:27:17.580614Z",
287-
"shell.execute_reply.started": "2025-07-11T02:27:17.539264Z"
288-
}
289-
},
194+
"id": "16",
195+
"metadata": {},
290196
"outputs": [],
291197
"source": [
292198
"struct_col = struct(dedup_df[\"track_id\"],dedup_df[\"sub_path\"],dedup_df[\"hash_html\"],)\n",
@@ -296,16 +202,8 @@
296202
{
297203
"cell_type": "code",
298204
"execution_count": null,
299-
"id": "14",
300-
"metadata": {
301-
"execution": {
302-
"iopub.execute_input": "2025-07-11T02:27:17.582756Z",
303-
"iopub.status.busy": "2025-07-11T02:27:17.582439Z",
304-
"iopub.status.idle": "2025-07-11T02:28:54.423589Z",
305-
"shell.execute_reply": "2025-07-11T02:28:54.422850Z",
306-
"shell.execute_reply.started": "2025-07-11T02:27:17.582737Z"
307-
}
308-
},
205+
"id": "17",
206+
"metadata": {},
309207
"outputs": [],
310208
"source": [
311209
"config[\"skip_output_version\"] = True\n",

0 commit comments

Comments
 (0)