@@ -290,7 +290,7 @@ return scrapy.Request(
     url="https://example.org",
     meta={
         "playwright": True,
-        "playwright_context": "persistent",
+        "playwright_context": "awesome_context",
     },
 )
 ```
@@ -307,7 +307,7 @@ return scrapy.Request(
     url="https://example.org",
     meta={
         "playwright": True,
-        "playwright_context": "new",
+        "playwright_context": "awesome_context",
         "playwright_context_kwargs": {
             "ignore_https_errors": True,
         },
@@ -319,15 +319,16 @@ return scrapy.Request(
 Type `bool`, default `False`

 If `True`, the [Playwright page](https://playwright.dev/python/docs/api/class-page)
-that was used to download the request will be available in the callback via
-`response.meta['playwright_page']`.
+that was used to download the request will be available in the callback at
+`response.meta['playwright_page']`. If `False` (or unset), the page will be
+closed immediately after processing the request.

 **Important!**

 This meta key is entirely optional, it's NOT necessary for the page to load or for any
 asynchronous operation to be performed (specifically, it's NOT necessary for `PageMethod`
 objects to be applied). Use it only if you need access to the Page object in the callback
-that handles the request.
+that handles the response.

 For more information and important notes see
 [Receiving Page objects in callbacks](#receiving-page-objects-in-callbacks).
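
For illustration, a minimal sketch of a callback that receives and closes the page; the spider itself is hypothetical, only `playwright_include_page` and `response.meta["playwright_page"]` come from the docs above:

```python
import scrapy


class PageSpider(scrapy.Spider):
    name = "page_spider"

    def start_requests(self):
        yield scrapy.Request(
            url="https://example.org",
            meta={"playwright": True, "playwright_include_page": True},
        )

    async def parse(self, response):
        # The Playwright page that was used to download this response.
        page = response.meta["playwright_page"]
        title = await page.title()
        # With playwright_include_page=True the page is not closed
        # automatically, so close it once it is no longer needed.
        await page.close()
        return {"title": title}
```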
@@ -371,8 +372,8 @@ class AwesomeSpider(scrapy.Spider):

 **Important!**

-`scrapy-playwright` uses `Page.route` & `Page.unroute` internally, please
-avoid using these methods unless you know exactly what you're doing.
+`scrapy-playwright` uses `Page.route` & `Page.unroute` internally; avoid using
+these methods unless you know exactly what you're doing.

 ### `playwright_page_methods`

 Type `Iterable[PageMethod]`, default `()`
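
For instance, a sketch of passing a `PageMethod` to take a full-page screenshot; the spider name and file path are illustrative:

```python
import scrapy
from scrapy_playwright.page import PageMethod


class ScreenshotSpider(scrapy.Spider):
    name = "screenshot_spider"

    def start_requests(self):
        yield scrapy.Request(
            url="https://example.org",
            meta={
                "playwright": True,
                "playwright_page_methods": [
                    # Each PageMethod wraps the name of a Playwright Page
                    # method plus the arguments to call it with.
                    PageMethod("screenshot", path="example.png", full_page=True),
                ],
            },
        )
```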
@@ -494,7 +495,7 @@ class AwesomeSpiderWithPage(scrapy.Spider):

 * When passing `playwright_include_page=True`, make sure pages are always closed
   when they are no longer used. It's recommended to set a Request errback to make
   sure pages are closed even if a request fails (if `playwright_include_page=False`
-  or unset, pages are automatically closed upon encountering an exception).
+  pages are automatically closed upon encountering an exception).
   This is important, as open pages count towards the limit set by
   `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT` and crawls could freeze if the limit is reached
   and pages remain open indefinitely.
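
A sketch of such an errback (the method name is hypothetical):

```python
async def errback_close_page(self, failure):
    # With playwright_include_page=True the page is not closed automatically,
    # so close it here even though the request failed.
    page = failure.request.meta["playwright_page"]
    await page.close()
```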
@@ -575,7 +576,11 @@ def parse(self, response):
     url="https://example.org",
     callback=self.parse_in_new_context,
     errback=self.close_context_on_error,
-    meta={"playwright": True, "playwright_context": "new", "playwright_include_page": True},
+    meta={
+        "playwright": True,
+        "playwright_context": "awesome_context",
+        "playwright_include_page": True,
+    },
 )

 async def parse_in_new_context(self, response):
@@ -585,19 +590,17 @@ async def parse_in_new_context(self, response):
     return {"title": title}

 async def close_context_on_error(self, failure):
-    self.logger.warning("There was an error when processing %s: %s", failure.request, failure.value)
     page = failure.request.meta["playwright_page"]
     await page.context.close()
 ```

 ### Maximum concurrent context count

 Specify a value for the `PLAYWRIGHT_MAX_CONTEXTS` setting to limit the amount
-of concurent contexts. This setting should be used with caution: it's possible
-to block the whole crawl if contexts are not closed after they are no longer
-used (refer to the above section to dinamically close contexts). Make sure to
-define an errback to still be able to close the context even if there are
-errors with a request.
+of concurrent contexts. Use with caution: it's possible to block the whole crawl
+if contexts are not closed after they are no longer used (refer to the above
+section to dynamically close contexts). Make sure to define an errback to still
+close contexts even if there are errors.
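
For instance, a minimal settings sketch (the limit value is arbitrary):

```python
# settings.py
PLAYWRIGHT_MAX_CONTEXTS = 8  # allow at most 8 concurrent browser contexts
```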

 ## Proxy support
@@ -627,7 +630,7 @@ class ProxySpider(Spider):
         print(response.text)
 ```

-You can also set proxies per context with the `PLAYWRIGHT_CONTEXTS` setting:
+Proxies can also be set at the context level with the `PLAYWRIGHT_CONTEXTS` setting:

 ```python
 PLAYWRIGHT_CONTEXTS = {
@@ -715,7 +718,7 @@ async def parse(self, response):

 ### Supported methods

-Please refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page)
+Refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page)
 to see available methods.

 ### Impact on Response objects
@@ -761,14 +764,20 @@ class EventSpider(scrapy.Spider):
         logging.info(f"Received response with URL {response.url}")
 ```

-See the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page) for a list of
-the accepted events and the arguments passed to their handlers.
+See the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page)
+for a list of the accepted events and the arguments passed to their handlers.
+
+### Notes about page event handlers

-**Note**: keep in mind that, unless they are
-[removed later](https://playwright.dev/python/docs/events#addingremoving-event-listener),
-these handlers will remain attached to the page and will be called for subsequent
-downloads using the same page. This is usually not a problem, since by default
-requests are performed in single-use pages.
+* Event handlers will remain attached to the page and will be called for
+  subsequent downloads using the same page unless they are
+  [removed later](https://playwright.dev/python/docs/events#addingremoving-event-listener).
+  This is usually not a problem, since by default requests are performed in
+  single-use pages.
+* Event handlers will process Playwright objects, not Scrapy ones. For example,
+  for each Scrapy request/response there will be a matching Playwright
+  request/response, but not the other way around: background requests/responses
+  for images, scripts, stylesheets, etc. are not seen by Scrapy.
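
As an illustration of the second point, a sketch of a handler that observes background requests Scrapy never sees; the handler name and filter are hypothetical, and it would be attached via the page event handlers meta key shown above:

```python
import logging

from playwright.async_api import Request as PlaywrightRequest


async def log_background_request(request: PlaywrightRequest) -> None:
    # Handlers for the "request" page event receive Playwright Request
    # objects, including subresource requests that never reach Scrapy.
    if request.resource_type in ("image", "script", "stylesheet"):
        logging.info(f"Background request: {request.url}")
```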

 ## Examples