|
4 | 4 | "cell_type": "code",
|
5 | 5 | "execution_count": null,
|
6 | 6 | "id": "0",
|
7 |
| - "metadata": { |
8 |
| - "execution": { |
9 |
| - "iopub.status.idle": "2025-07-11T02:26:43.748017Z", |
10 |
| - "shell.execute_reply": "2025-07-11T02:26:43.747366Z", |
11 |
| - "shell.execute_reply.started": "2025-07-11T02:26:34.645411Z" |
12 |
| - } |
13 |
| - }, |
| 7 | + "metadata": {}, |
14 | 8 | "outputs": [],
|
15 | 9 | "source": [
|
16 | 10 | "from xinghe.spark import *\n",
|
|
50 | 44 | "cell_type": "code",
|
51 | 45 | "execution_count": null,
|
52 | 46 | "id": "2",
|
53 |
| - "metadata": { |
54 |
| - "execution": { |
55 |
| - "iopub.execute_input": "2025-07-11T02:26:43.749268Z", |
56 |
| - "iopub.status.busy": "2025-07-11T02:26:43.748929Z", |
57 |
| - "iopub.status.idle": "2025-07-11T02:26:43.751756Z", |
58 |
| - "shell.execute_reply": "2025-07-11T02:26:43.751304Z", |
59 |
| - "shell.execute_reply.started": "2025-07-11T02:26:43.749250Z" |
60 |
| - } |
61 |
| - }, |
| 47 | + "metadata": {}, |
62 | 48 | "outputs": [],
|
63 | 49 | "source": [
|
64 | 50 | "#dump_paths = []\n",
|
|
71 | 57 | "cell_type": "code",
|
72 | 58 | "execution_count": null,
|
73 | 59 | "id": "3",
|
74 |
| - "metadata": { |
75 |
| - "execution": { |
76 |
| - "iopub.execute_input": "2025-07-11T02:26:43.752632Z", |
77 |
| - "iopub.status.busy": "2025-07-11T02:26:43.752346Z", |
78 |
| - "iopub.status.idle": "2025-07-11T02:26:43.762655Z", |
79 |
| - "shell.execute_reply": "2025-07-11T02:26:43.762229Z", |
80 |
| - "shell.execute_reply.started": "2025-07-11T02:26:43.752617Z" |
81 |
| - } |
82 |
| - }, |
| 60 | + "metadata": {}, |
83 | 61 | "outputs": [],
|
84 | 62 | "source": [
|
85 | 63 | "#input_df = spark.read.format(\"json\").load(dump_paths)"
|
|
97 | 75 | "cell_type": "code",
|
98 | 76 | "execution_count": null,
|
99 | 77 | "id": "5",
|
100 |
| - "metadata": { |
101 |
| - "execution": { |
102 |
| - "iopub.execute_input": "2025-07-11T02:26:43.763518Z", |
103 |
| - "iopub.status.busy": "2025-07-11T02:26:43.763240Z", |
104 |
| - "iopub.status.idle": "2025-07-11T02:26:43.770017Z", |
105 |
| - "shell.execute_reply": "2025-07-11T02:26:43.769479Z", |
106 |
| - "shell.execute_reply.started": "2025-07-11T02:26:43.763503Z" |
107 |
| - } |
108 |
| - }, |
| 78 | + "metadata": {}, |
109 | 79 | "outputs": [],
|
110 | 80 | "source": [
|
111 | 81 | "#already_exist_id_v_df = read_any_path(spark, already_exist_id_path, config)"
|
|
115 | 85 | "cell_type": "code",
|
116 | 86 | "execution_count": null,
|
117 | 87 | "id": "6",
|
118 |
| - "metadata": { |
119 |
| - "execution": { |
120 |
| - "iopub.execute_input": "2025-07-11T02:26:43.772007Z", |
121 |
| - "iopub.status.busy": "2025-07-11T02:26:43.771594Z", |
122 |
| - "iopub.status.idle": "2025-07-11T02:26:43.776268Z", |
123 |
| - "shell.execute_reply": "2025-07-11T02:26:43.775843Z", |
124 |
| - "shell.execute_reply.started": "2025-07-11T02:26:43.771991Z" |
125 |
| - } |
126 |
| - }, |
| 88 | + "metadata": {}, |
127 | 89 | "outputs": [],
|
128 | 90 | "source": [
|
129 | 91 | "# 定义 Schema\n",
|
|
147 | 109 | "cell_type": "code",
|
148 | 110 | "execution_count": null,
|
149 | 111 | "id": "8",
|
150 |
| - "metadata": { |
151 |
| - "execution": { |
152 |
| - "iopub.execute_input": "2025-07-11T02:26:43.776944Z", |
153 |
| - "iopub.status.busy": "2025-07-11T02:26:43.776805Z", |
154 |
| - "iopub.status.idle": "2025-07-11T02:26:43.784766Z", |
155 |
| - "shell.execute_reply": "2025-07-11T02:26:43.784336Z", |
156 |
| - "shell.execute_reply.started": "2025-07-11T02:26:43.776931Z" |
157 |
| - } |
158 |
| - }, |
| 112 | + "metadata": {}, |
159 | 113 | "outputs": [],
|
160 | 114 | "source": [
|
161 | 115 | "#join_df = input_df.join(already_exist_id_df, on=\"hash_html\", how=\"left\")\n",
|
|
173 | 127 | {
|
174 | 128 | "cell_type": "code",
|
175 | 129 | "execution_count": null,
|
176 |
| - "id": "da837b3e-5eee-43b1-9c77-836aac1203df", |
177 |
| - "metadata": { |
178 |
| - "execution": { |
179 |
| - "iopub.execute_input": "2025-07-11T02:26:43.785611Z", |
180 |
| - "iopub.status.busy": "2025-07-11T02:26:43.785331Z", |
181 |
| - "iopub.status.idle": "2025-07-11T02:27:13.606489Z", |
182 |
| - "shell.execute_reply": "2025-07-11T02:27:13.605744Z", |
183 |
| - "shell.execute_reply.started": "2025-07-11T02:26:43.785596Z" |
184 |
| - } |
185 |
| - }, |
| 130 | + "id": "10", |
| 131 | + "metadata": {}, |
186 | 132 | "outputs": [],
|
187 | 133 | "source": [
|
188 | 134 | "input_df = read_any_path(spark, \",\".join(base_input_path), config)"
|
|
191 | 137 | {
|
192 | 138 | "cell_type": "code",
|
193 | 139 | "execution_count": null,
|
194 |
| - "id": "20135ae2-b1d0-43c9-b241-8d23de94f01d", |
195 |
| - "metadata": { |
196 |
| - "execution": { |
197 |
| - "iopub.execute_input": "2025-07-11T02:27:13.608658Z", |
198 |
| - "iopub.status.busy": "2025-07-11T02:27:13.607902Z", |
199 |
| - "iopub.status.idle": "2025-07-11T02:27:13.612633Z", |
200 |
| - "shell.execute_reply": "2025-07-11T02:27:13.611707Z", |
201 |
| - "shell.execute_reply.started": "2025-07-11T02:27:13.608633Z" |
202 |
| - } |
203 |
| - }, |
| 140 | + "id": "11", |
| 141 | + "metadata": {}, |
204 | 142 | "outputs": [],
|
205 | 143 | "source": [
|
206 | 144 | "def json_data(row_iter)->Row:\n",
|
|
212 | 150 | {
|
213 | 151 | "cell_type": "code",
|
214 | 152 | "execution_count": null,
|
215 |
| - "id": "0c0435c3-bd5e-4571-ab29-90c522023dec", |
216 |
| - "metadata": { |
217 |
| - "execution": { |
218 |
| - "iopub.execute_input": "2025-07-11T02:27:13.614260Z", |
219 |
| - "iopub.status.busy": "2025-07-11T02:27:13.613866Z", |
220 |
| - "iopub.status.idle": "2025-07-11T02:27:17.465595Z", |
221 |
| - "shell.execute_reply": "2025-07-11T02:27:17.464788Z", |
222 |
| - "shell.execute_reply.started": "2025-07-11T02:27:13.614237Z" |
223 |
| - } |
224 |
| - }, |
| 153 | + "id": "12", |
| 154 | + "metadata": {}, |
225 | 155 | "outputs": [],
|
226 | 156 | "source": [
|
227 | 157 | "undedup_id_df = input_df.rdd.mapPartitions(json_data).toDF()"
|
|
230 | 160 | {
|
231 | 161 | "cell_type": "code",
|
232 | 162 | "execution_count": null,
|
233 |
| - "id": "10", |
234 |
| - "metadata": { |
235 |
| - "execution": { |
236 |
| - "iopub.execute_input": "2025-07-11T02:27:17.466995Z", |
237 |
| - "iopub.status.busy": "2025-07-11T02:27:17.466683Z", |
238 |
| - "iopub.status.idle": "2025-07-11T02:27:17.526987Z", |
239 |
| - "shell.execute_reply": "2025-07-11T02:27:17.526167Z", |
240 |
| - "shell.execute_reply.started": "2025-07-11T02:27:17.466977Z" |
241 |
| - } |
242 |
| - }, |
| 163 | + "id": "13", |
| 164 | + "metadata": {}, |
243 | 165 | "outputs": [],
|
244 | 166 | "source": [
|
245 | 167 | "def deduplicate_partition(partition):\n",
|
|
251 | 173 | {
|
252 | 174 | "cell_type": "code",
|
253 | 175 | "execution_count": null,
|
254 |
| - "id": "11", |
255 |
| - "metadata": { |
256 |
| - "execution": { |
257 |
| - "iopub.execute_input": "2025-07-11T02:27:17.528440Z", |
258 |
| - "iopub.status.busy": "2025-07-11T02:27:17.528025Z", |
259 |
| - "iopub.status.idle": "2025-07-11T02:27:17.537970Z", |
260 |
| - "shell.execute_reply": "2025-07-11T02:27:17.537362Z", |
261 |
| - "shell.execute_reply.started": "2025-07-11T02:27:17.528418Z" |
262 |
| - } |
263 |
| - }, |
| 176 | + "id": "14", |
| 177 | + "metadata": {}, |
264 | 178 | "outputs": [],
|
265 | 179 | "source": [
|
266 | 180 | "dedup_df = dedup_part_df.dropDuplicates([\"hash_html\"])"
|
267 | 181 | ]
|
268 | 182 | },
|
269 | 183 | {
|
270 | 184 | "cell_type": "markdown",
|
271 |
| - "id": "12", |
| 185 | + "id": "15", |
272 | 186 | "metadata": {},
|
273 | 187 | "source": [
|
274 | 188 | "# 写出s3"
|
|
277 | 191 | {
|
278 | 192 | "cell_type": "code",
|
279 | 193 | "execution_count": null,
|
280 |
| - "id": "13", |
281 |
| - "metadata": { |
282 |
| - "execution": { |
283 |
| - "iopub.execute_input": "2025-07-11T02:27:17.539280Z", |
284 |
| - "iopub.status.busy": "2025-07-11T02:27:17.539112Z", |
285 |
| - "iopub.status.idle": "2025-07-11T02:27:17.581417Z", |
286 |
| - "shell.execute_reply": "2025-07-11T02:27:17.580614Z", |
287 |
| - "shell.execute_reply.started": "2025-07-11T02:27:17.539264Z" |
288 |
| - } |
289 |
| - }, |
| 194 | + "id": "16", |
| 195 | + "metadata": {}, |
290 | 196 | "outputs": [],
|
291 | 197 | "source": [
|
292 | 198 | "struct_col = struct(dedup_df[\"track_id\"],dedup_df[\"sub_path\"],dedup_df[\"hash_html\"],)\n",
|
|
296 | 202 | {
|
297 | 203 | "cell_type": "code",
|
298 | 204 | "execution_count": null,
|
299 |
| - "id": "14", |
300 |
| - "metadata": { |
301 |
| - "execution": { |
302 |
| - "iopub.execute_input": "2025-07-11T02:27:17.582756Z", |
303 |
| - "iopub.status.busy": "2025-07-11T02:27:17.582439Z", |
304 |
| - "iopub.status.idle": "2025-07-11T02:28:54.423589Z", |
305 |
| - "shell.execute_reply": "2025-07-11T02:28:54.422850Z", |
306 |
| - "shell.execute_reply.started": "2025-07-11T02:27:17.582737Z" |
307 |
| - } |
308 |
| - }, |
| 205 | + "id": "17", |
| 206 | + "metadata": {}, |
309 | 207 | "outputs": [],
|
310 | 208 | "source": [
|
311 | 209 | "config[\"skip_output_version\"] = True\n",
|
|
0 commit comments