Skip to content

Handle downloads as binary responses #228

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Oct 19, 2023
2 changes: 2 additions & 0 deletions examples/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.png
*.pdf
31 changes: 31 additions & 0 deletions examples/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from pathlib import Path

from scrapy import Spider, Request


class DownloadSpider(Spider):
    """Example spider showing scrapy-playwright handling of both regular
    HTML pages and binary downloads (e.g. a PDF served as an attachment).

    Downloaded files are written next to this script, using the filename
    suggested by the browser (exposed in ``playwright_suggested_filename``).
    """

    name = "download"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        # One ordinary HTML page and one URL that triggers a file download.
        urls = (
            "https://example.org",
            "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        )
        for url in urls:
            yield Request(url=url, meta={"playwright": True})

    def parse(self, response):
        # Only download responses carry a suggested filename; save those to disk.
        suggested = response.meta.get("playwright_suggested_filename")
        if suggested:
            (Path(__file__).parent / suggested).write_bytes(response.body)
        yield {
            "url": response.url,
            "response_cls": response.__class__.__name__,
            "first_bytes": response.body[:60],
            "filename": suggested,
        }
1 change: 1 addition & 0 deletions pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ disable=
duplicate-code,
import-outside-toplevel,
protected-access,
too-many-public-methods,
unnecessary-dunder-call,


Expand Down
94 changes: 73 additions & 21 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
from contextlib import suppress
from dataclasses import dataclass
from ipaddress import ip_address
from tempfile import NamedTemporaryFile
from time import time
from typing import Awaitable, Callable, Dict, Optional, Type, TypeVar, Union
from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union

from playwright.async_api import (
BrowserContext,
BrowserType,
Download,
Error as PlaywrightError,
Page,
PlaywrightContextManager,
Expand Down Expand Up @@ -319,7 +321,7 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
)

try:
result = await self._download_request_with_page(request, page, spider)
return await self._download_request_with_page(request, page, spider)
except Exception as ex:
if not request.meta.get("playwright_include_page") and not page.is_closed():
logger.warning(
Expand All @@ -339,8 +341,6 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
await page.close()
self.stats.inc_value("playwright/page_count/closed")
raise
else:
return result

async def _download_request_with_page(
self, request: Request, page: Page, spider: Spider
Expand All @@ -349,51 +349,61 @@ async def _download_request_with_page(
if request.meta.get("playwright_include_page"):
request.meta["playwright_page"] = page

context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)

start_time = time()
page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {}
page_goto_kwargs.pop("url", None)
response = await page.goto(url=request.url, **page_goto_kwargs)
if response is None:
response, download = await self._get_response_and_download(request=request, page=page)
if isinstance(response, PlaywrightResponse):
await _set_redirect_meta(request=request, response=response)
headers = Headers(await response.all_headers())
headers.pop("Content-Encoding", None)
else:
logger.warning(
"Navigating to %s returned None, the response"
" will have empty headers and status 200",
request,
extra={
"spider": spider,
"context_name": context_name,
"context_name": request.meta.get("playwright_context"),
"scrapy_request_url": request.url,
"scrapy_request_method": request.method,
},
)
headers = Headers()
else:
await _set_redirect_meta(request=request, response=response)
headers = Headers(await response.all_headers())
headers.pop("Content-Encoding", None)

await self._apply_page_methods(page, request, spider)
body_str = await _get_page_content(
page=page,
spider=spider,
context_name=context_name,
context_name=request.meta.get("playwright_context"),
scrapy_request_url=request.url,
scrapy_request_method=request.method,
)
request.meta["download_latency"] = time() - start_time

server_ip_address = None
with suppress(AttributeError, KeyError, TypeError, ValueError):
server_addr = await response.server_addr()
server_ip_address = ip_address(server_addr["ipAddress"])

with suppress(AttributeError):
if response is not None:
request.meta["playwright_security_details"] = await response.security_details()
with suppress(KeyError, TypeError, ValueError):
server_addr = await response.server_addr()
server_ip_address = ip_address(server_addr["ipAddress"])

if download.get("exception"):
raise download["exception"]

if not request.meta.get("playwright_include_page"):
await page.close()
self.stats.inc_value("playwright/page_count/closed")

if download:
request.meta["playwright_suggested_filename"] = download.get("suggested_filename")
respcls = responsetypes.from_args(url=download["url"], body=download["bytes"])
return respcls(
url=download["url"],
status=200,
body=download["bytes"],
request=request,
flags=["playwright"],
)

body, encoding = _encode_body(headers=headers, text=body_str)
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
return respcls(
Expand All @@ -407,6 +417,48 @@ async def _download_request_with_page(
ip_address=server_ip_address,
)

async def _get_response_and_download(
self, request: Request, page: Page
) -> Tuple[Optional[PlaywrightResponse], dict]:
response: Optional[PlaywrightResponse] = None
download: dict = {} # updated in-place in _handle_download
download_ready = asyncio.Event()

async def _handle_download(dwnld: Download) -> None:
self.stats.inc_value("playwright/download_count")
try:
if failure := await dwnld.failure():
raise RuntimeError(f"Failed to download {dwnld.url}: {failure}")
with NamedTemporaryFile() as temp_file:
await dwnld.save_as(temp_file.name)
temp_file.seek(0)
download["bytes"] = temp_file.read()
download["url"] = dwnld.url
download["suggested_filename"] = dwnld.suggested_filename
except Exception as ex:
download["exception"] = ex
finally:
download_ready.set()

page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {}
page_goto_kwargs.pop("url", None)
page.on("download", _handle_download)
try:
response = await page.goto(url=request.url, **page_goto_kwargs)
except PlaywrightError as err:
if not (
self.browser_type_name in ("firefox", "webkit")
and "Download is starting" in err.message
or self.browser_type_name == "chromium"
and "net::ERR_ABORTED" in err.message
):
raise
await download_ready.wait()
finally:
page.remove_listener("download", _handle_download)

return response, download

async def _apply_page_methods(self, page: Page, request: Request, spider: Spider) -> None:
context_name = request.meta.get("playwright_context")
page_methods = request.meta.get("playwright_page_methods") or ()
Expand Down
15 changes: 15 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
def pytest_sessionstart(session):  # pylint: disable=unused-argument
    """Install Twisted's asyncio-based reactor for the whole test session.

    Runs once, after the pytest Session object is created and before
    collection and the test loop begin. If a reactor is already installed
    it must be the AsyncioSelectorReactor; anything else is a hard error.
    """
    from twisted.internet.asyncioreactor import AsyncioSelectorReactor, install
    from twisted.internet.error import ReactorAlreadyInstalledError

    try:
        install()
    except ReactorAlreadyInstalledError as exc:
        from twisted.internet import reactor

        # A pre-installed asyncio reactor is fine; any other kind is not.
        if isinstance(reactor, AsyncioSelectorReactor):
            return
        raise RuntimeError(f"Wrong reactor installed: {type(reactor)}") from exc
46 changes: 29 additions & 17 deletions tests/mockserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from subprocess import Popen, PIPE
from threading import Thread
from typing import Optional
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse, parse_qs


class StaticMockServer:
Expand Down Expand Up @@ -42,39 +42,51 @@ def urljoin(self, url):
class _RequestHandler(BaseHTTPRequestHandler):
def do_POST(self) -> None:
"""Echo back the request body"""
content_length = int(self.headers["Content-Length"])
body = self.rfile.read(content_length)
content_length = int(self.headers.get("Content-Length") or 0)
body_bytes = b"Request body: " + self.rfile.read(content_length)
self.send_response(200)
self.send_header("Content-Length", str(len(body_bytes)))
self.end_headers()
self.wfile.write(b"Request body: ")
self.wfile.write(body)
self.wfile.write(body_bytes)

def do_GET(self) -> None:
if self.path == "/headers":
parsed_path = urlparse(self.path)
query_string = {key: values[0] for key, values in parse_qs(parsed_path.query).items()}

if delay := int(query_string.get("delay") or 0):
print(f"Sleeping {delay} seconds on path {parsed_path.path}...")
time.sleep(delay)

if parsed_path.path == "/headers":
self._send_json(dict(self.headers))
elif self.path == "/redirect2":
elif parsed_path.path == "/redirect2":
self.send_response(302)
self.send_header("Content-Length", "0")
self.send_header("Location", "/redirect")
self.end_headers()
elif self.path == "/redirect":
elif parsed_path.path == "/redirect":
self.send_response(301)
self.send_header("Content-Length", "0")
self.send_header("Location", "/headers")
self.end_headers()
elif parsed_path.path == "/mancha.pdf":
body_bytes = (Path(__file__).absolute().parent / "site/files/mancha.pdf").read_bytes()
content_length_multiplier = int(query_string.get("content_length_multiplier") or 1)
self.send_response(200)
self.send_header("Content-Type", "application/pdf")
self.send_header("Content-Disposition", 'attachment; filename="mancha.pdf"')
self.send_header("Content-Length", str(len(body_bytes) * content_length_multiplier))
self.end_headers()
self.wfile.write(body_bytes)
else:
delay_match = re.match(r"^/delay/(\d+)$", self.path)
if delay_match:
delay = int(delay_match.group(1))
print(f"Sleeping {delay} seconds...")
time.sleep(delay)
self._send_json({"delay": delay})
else:
self._send_json({"error": "unknown path"}, status=400)
self._send_json({"error": "unknown path"}, status=404)

def _send_json(self, body: dict, status: int = 200) -> None:
body_bytes = json.dumps(body, indent=2).encode("utf8")
self.send_response(status)
self.send_header("Content-Length", str(len(body_bytes)))
self.send_header("Content-Type", "application/json")
self.end_headers()
body_bytes = json.dumps(body, indent=4).encode("utf8")
self.wfile.write(body_bytes)


Expand Down
Binary file added tests/site/files/mancha.pdf
Binary file not shown.
Loading