Module scrapy_patterns.spiderlings.site_structure_discoverer
Contains the site structure discoverer spiderling.
Expand source code
"""Contains the site structure discoverer spiderling."""
import logging
from typing import List, Tuple, Callable, Optional
from scrapy import Spider, Request
from scrapy.http import Response
from scrapy_patterns.request_factory import RequestFactory
from scrapy_patterns.site_structure import SiteStructure
class CategoryParser:
    """Interface for extracting category links from a page.

    Implementations inspect a response and report every category found on it.
    """

    def parse(self, response) -> List[Tuple[str, str]]:
        """Extract the categories present in *response*.

        Args:
            response: The response to parse.

        Returns:
            A list of ``(url, name)`` tuples — each category's URL first,
            its name second.
        """
        raise NotImplementedError()
class SiteStructureDiscoverer:
    """Discovers the site structure by crawling category pages level by level."""
    # pylint: disable=too-many-arguments, too-many-instance-attributes

    def __init__(self, spider: Spider, start_url: str, category_parsers: List[CategoryParser],
                 request_factory: RequestFactory,
                 on_discovery_complete: Callable[['SiteStructureDiscoverer'], Optional[Request]] = None):
        """
        Args:
            spider: The spider to which this belongs.
            start_url: Starting URL of categories.
            category_parsers: List of category parsers for each level of categories. The last element in the list
                should parse the leaf categories.
            request_factory: The request factory.
            on_discovery_complete: An optional callback when the discovery is complete. It'll receive this discoverer
                as its argument. It should return a scrapy request to continue the scraping with.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.name = spider.name  # Needed to conform to Scrapy Spiders.
        self.structure = SiteStructure(self.name)
        self.__start_url = start_url
        self.__category_parsers = category_parsers
        self.__request_factory = request_factory
        # Count of in-flight category requests; discovery is complete when it drops to zero.
        self.__remaining_work = 0
        self.__on_discovery_complete = on_discovery_complete if on_discovery_complete else self.__do_nothing

    def create_start_request(self):
        """
        Creates the starting request.

        Returns: The starting request.
        """
        self.__remaining_work += 1
        return self.__request_factory.create(self.__start_url, self.__process_category_response,
                                             cb_kwargs={"category_index": 0, "path": None})

    def __process_category_response(self, response, category_index: int, path: Optional[str]):
        """Parses one category page, records discovered categories, and yields follow-up requests.

        Args:
            response: The response of the category page.
            category_index: Index into the category-parser list for this level.
            path: Structure path of the parent category, or None for the root level.
        """
        self.__remaining_work -= 1
        category_parser = self.__category_parsers[category_index]
        urls_and_names = self.__get_urls_and_names(response, category_parser)
        requests = self.__prepare_requests(urls_and_names, path, category_index)
        self.__remaining_work += len(requests)
        self.logger.info("[%s] Remaining work(s): %d", self.name, self.__remaining_work)
        if self.__remaining_work == 0:
            self.logger.info("[%s] Discovery complete.\n%s", self.name, str(self.structure))
            # The callback may return a request to continue scraping with (or None).
            yield self.__on_discovery_complete(self)
        for req in requests:
            yield req

    @staticmethod
    def __get_urls_and_names(response: Response, category_parser: CategoryParser):
        """Delegates parsing of (url, name) pairs to the given category parser."""
        return category_parser.parse(response)

    @staticmethod
    def __do_nothing(_):
        """Default no-op completion callback; yields no continuation request."""
        return None

    def __prepare_requests(self, urls_and_names: List[Tuple[str, str]], current_path: Optional[str],
                           category_index: int):
        """Registers each newly seen category in the structure and builds requests for the next level."""
        requests = []
        for url, name in urls_and_names:
            structure_path = self.__determine_structure_path(current_path, name)
            is_added = self.__try_add_path(structure_path, url)
            if is_added:
                self.__append_to_requests_if_not_finished(category_index, requests, (url, structure_path))
        return requests

    @staticmethod
    def __determine_structure_path(current_path: Optional[str], name: str) -> str:
        """Returns the structure path for *name*: the name itself at the root, else parent-path/name."""
        if current_path is None:
            return name
        return current_path + "/" + name

    def __try_add_path(self, path: str, url: str) -> bool:
        """Adds *path* (with its URL) to the structure.

        Returns:
            True if the path was added, False if it already existed (a warning is logged).
        """
        if self.structure.get_node_at_path(path) is not None:
            # Bug fix: the original message hard-coded the word "path" instead of the actual value.
            self.logger.warning("Path \"%s\" already exists; path to add is ignored!", path)
            return False
        self.structure.add_node_with_path(path, url)
        return True

    def __append_to_requests_if_not_finished(self, category_index: int, requests: List[Request],
                                             url_and_path: Tuple[str, str]):
        """Appends a request for the next category level unless this was the last (leaf) level."""
        if category_index + 1 < len(self.__category_parsers):
            request = self.__request_factory.create(
                url_and_path[0], self.__process_category_response,
                cb_kwargs={"category_index": category_index + 1, "path": url_and_path[1]})
            requests.append(request)
Classes
class CategoryParser
-
Interface used for parsing categories.
Expand source code
class CategoryParser: """Interface used for parsing categories.""" def parse(self, response) -> List[Tuple[str, str]]: """ Parses categories from the response. Args: response: The response Returns: List of tuples, where the first element is the URL of the category, and the second is the name. """ raise NotImplementedError()
Methods
def parse(self, response) ‑> List[Tuple[str, str]]
-
Parses categories from the response.
Args
response
- The response
Returns: List of tuples, where the first element is the URL of the category, and the second is the name.
Expand source code
def parse(self, response) -> List[Tuple[str, str]]: """ Parses categories from the response. Args: response: The response Returns: List of tuples, where the first element is the URL of the category, and the second is the name. """ raise NotImplementedError()
class SiteStructureDiscoverer (spider: scrapy.spiders.Spider, start_url: str, category_parsers: List[CategoryParser], request_factory: RequestFactory, on_discovery_complete: Callable[['SiteStructureDiscoverer'], Optional[scrapy.http.request.Request]] = None)
-
Discovers the site structure.
Args
spider
- The spider to which this belongs.
start_url
- Starting URL of categories.
category_parsers
- List of category parsers for each level of categories. The last element in the list should parse the leaf categories.
request_factory
- The request factory.
on_discovery_complete
- An optional callback when the discovery is complete. It'll receive this discoverer
as its argument. It should return a scrapy request to continue the scraping with.
Expand source code
class SiteStructureDiscoverer: """Discovers the site structure.""" # pylint: disable=too-many-arguments, too-many-instance-attributes def __init__(self, spider: Spider, start_url: str, category_parsers: List[CategoryParser], request_factory: RequestFactory, on_discovery_complete: Callable[['SiteStructureDiscoverer'], Optional[Request]] = None): """ Args: spider: The spider to which this belongs. start_url: Starting URL of categories. category_parsers: List of category parsers for each level of categories. The last element in the list should parse the leaf categories. request_factory: The request factory. on_discovery_complete: An optional callback when the discovery is complete. It'll receive this discoverer as its argument. It should return a scrapy request to continue the scraping with. """ self.logger = logging.getLogger(self.__class__.__name__) self.name = spider.name # Needed to conform to Scrapy Spiders. self.structure = SiteStructure(self.name) self.__start_url = start_url self.__category_parsers = category_parsers self.__request_factory = request_factory self.__remaining_work = 0 self.__on_discovery_complete = on_discovery_complete if on_discovery_complete else self.__do_nothing def create_start_request(self): """ Creates the starting request. Returns: The starting request. 
""" self.__remaining_work += 1 return self.__request_factory.create(self.__start_url, self.__process_category_response, cb_kwargs={"category_index": 0, "path": None}) def __process_category_response(self, response, category_index: int, path: str): self.__remaining_work -= 1 category_parser = self.__category_parsers[category_index] urls_and_names = self.__get_urls_and_names(response, category_parser) requests = self.__prepare_requests(urls_and_names, path, category_index) self.__remaining_work += len(requests) self.logger.info("[%s] Remaining work(s): %d", self.name, self.__remaining_work) if self.__remaining_work == 0: self.logger.info("[%s] Discovery complete.\n" "%s", self.name, str(self.structure)) yield self.__on_discovery_complete(self) for req in requests: yield req @staticmethod def __get_urls_and_names(response: Response, category_parser: CategoryParser): return category_parser.parse(response) @staticmethod def __do_nothing(_): return None def __prepare_requests(self, urls_and_names: List[Tuple[str, str]], current_path: str, category_index: int): requests = [] for url, name in urls_and_names: structure_path = self.__determine_structure_path(current_path, name) is_added = self.__try_add_path(structure_path, url) if is_added: self.__append_to_requests_if_not_finished(category_index, requests, (url, structure_path)) return requests @staticmethod def __determine_structure_path(current_path, name): if current_path is None: return name else: return current_path + "/" + name def __try_add_path(self, path: str, url: str) -> bool: if self.structure.get_node_at_path(path) is not None: self.logger.warning("Path \"path\" already exists; path to add is ignored!") return False else: self.structure.add_node_with_path(path, url) return True def __append_to_requests_if_not_finished(self, category_index: int, requests: List[Request], url_and_path: Tuple[str, str]): if category_index + 1 < len(self.__category_parsers): request = self.__request_factory.create( url_and_path[0], 
self.__process_category_response, cb_kwargs={"category_index": category_index + 1, "path": url_and_path[1]}) requests.append(request)
Methods
def create_start_request(self)
-
Creates the starting request. Returns: The starting request.
Expand source code
def create_start_request(self): """ Creates the starting request. Returns: The starting request. """ self.__remaining_work += 1 return self.__request_factory.create(self.__start_url, self.__process_category_response, cb_kwargs={"category_index": 0, "path": None})