Module scrapy_patterns.site_structure

Contains classes that are used to describe the structure of a site.

Expand source code
"""Contains classes that are used to describe the structure of a site."""
from enum import Enum
from typing import Optional, List, Union


class VisitState(Enum):
    """Visit state of a category (node).

    The member *names* are what Node.to_dict() stores and what Node.from_dict() uses to look a member up again,
    so renaming a member makes previously saved dicts unreadable.
    """
    # Initial state: Node.__init__ assigns it to every newly created node.
    NEW = 0
    # NOTE(review): presumably set while the node's sub-tree is still being processed - confirm against callers.
    IN_PROGRESS = 1
    # NOTE(review): presumably set once the node has been completely processed - confirm against callers.
    VISITED = 2


class Node:
    """
    A category of a site, modelled as a tree node. Sites are typically organised into categories which may hold
    sub-categories, so the whole site is represented as a tree of nodes.
    Attributes:
        name (str): The name of the node.
        url (str): The url at which the node (category) is available.
        parent (Node): The parent of the node, None for a root node.
        visit_state (VisitState): The visit state of the node. Starts out as VisitState.NEW.
        children (List[Node]): Children of the node. Starts out as an empty list.
    """

    def __init__(self, name: str, url: str, parent: 'Node' = None):
        """
        Args:
            name: The name of the node.
            url: The url at which the node (category) is available.
            parent: The node's parent. The new node is not appended to the parent's children here.
        """
        self.name = name
        self.url = url
        self.visit_state = VisitState.NEW
        self.children: List[Node] = []
        self.parent: Node = parent

    def get_path(self) -> str:
        """
        Returns: The slash (/) separated names of the nodes leading to this node, leaving out the name of the
        root node. The root itself yields an empty string, any other node a path starting with a slash.
        """
        parent = self.parent
        if parent is None:
            return ""
        # The parent's path is "" for the root, so first level nodes end up as "/<name>".
        return "/".join((parent.get_path(), self.name))

    def to_dict(self):
        """
        Returns: A dict representation of the tree rooted at this node. It has the keys "name", "url",
        "visit_state" (the name of the state) and "children" (list of child dicts, empty for a leaf).
        """
        serialized = {
            "name": self.name,
            "url": self.url,
            "visit_state": self.visit_state.name,
            "children": [],
        }
        if not self.children:
            return serialized
        serialized["children"] = [child.to_dict() for child in self.children]
        return serialized

    @classmethod
    def from_dict(cls, node_dict: dict):
        """
        Rebuild a tree from the dict representation produced by to_dict().
        Args:
            node_dict: The dict representation. It must hold the keys "name", "url", "visit_state" and
            "children".

        Returns: The restored tree, with the parent of every restored child set.
        """
        name, url, state_name = (node_dict[key] for key in ("name", "url", "visit_state"))
        state = VisitState[state_name]
        restored = Node(name, url)
        restored.visit_state = state
        # A falsy "children" entry (e.g. an empty list) simply means there is nothing to restore below.
        for child_dict in node_dict["children"] or ():
            child = Node.from_dict(child_dict)
            child.parent = restored
            restored.children.append(child)
        return restored

    def set_visit_state(self, visit_state: VisitState, propagate: bool = False):
        """
        Sets the visit state of this node, optionally applying the same state to every ancestor too.
        Args:
            visit_state: The new state.
            propagate: Whether to propagate to ancestors.
        """
        self.visit_state = visit_state
        if not propagate:
            return
        parent = self.parent
        if parent:
            # Each ancestor repeats the same step, so the state travels up to the root.
            parent.set_visit_state(visit_state, propagate)


class SiteStructure:
    """
    Handles the nodes of the structure.
    Attributes:
        root_node (Node): The root node.
    """

    def __init__(self, name=""):
        """
        Creates the structure with a root node initialized to the given name prefixed with '(root) '.
        Args:
            name: The name of the root node. (The root node doesn't have an url)
        """
        self.root_node = Node("(root) {}".format(name), "")

    def add_node_with_path(self, path: str, url: str):
        """
        Adds a new node with url under path, where the name of the new node will be the last part of the path.
        Nodes along the path must already exist, except for the last one. Leading and trailing slashes of the
        path are ignored.
        Args:
            path (str): The path.
            url (str): The url.

        Returns: The newly created node.

        Raises:
            RuntimeError: If a node already exists at path, or if the parent path does not exist.
        """
        if self.get_node_at_path(path) is not None:
            raise RuntimeError("Path \"{}\" already exists!".format(path))
        node_names = self.__split_path(path.strip("/"))
        # A single-part path is attached directly below the root.
        parent = self.root_node
        if len(node_names) > 1:
            parent_path = "/".join(node_names[:-1])
            parent = self.get_node_at_path(parent_path)
            if parent is None:
                raise RuntimeError("Parent path \"{}\" not existing!".format(parent_path))
        node = Node(node_names[-1], url, parent)
        parent.children.append(node)
        return node

    def get_node_at_path(self, path: str) -> Optional[Node]:
        """
        Gets a node at path. Leading and trailing slashes of the path are ignored, and the root node's name is
        not part of the path.
        Args:
            path (str): The path

        Returns: The node if found, else None.
        """
        children = self.root_node.children
        result_node = None
        for node_name in self.__split_path(path.strip("/")):
            result_node = self.__find_node(node_name, children)
            if result_node is None:
                # One missing part makes the whole path missing.
                break
            children = result_node.children
        return result_node

    def to_dict(self):
        """
        Returns: The structure as a dict.
        """
        return self.root_node.to_dict()

    @classmethod
    def from_dict(cls, struct_dict):
        """
        Creates a structure from its dict representation.
        Args:
            struct_dict: The dict.

        Returns: The site structure.
        """
        structure = SiteStructure()
        # The restored root replaces the default one created by the constructor.
        structure.root_node = Node.from_dict(struct_dict)
        return structure

    def find_leaf_with_visit_state(self, visit_state: Union[VisitState, List[VisitState]]) -> Optional[Node]:
        """
        Finds a leaf node with matching visit state(s). Search is DFS and starts at the root node, which is
        itself treated as a leaf while it has no children.
        Args:
            visit_state: Either a VisitState, or a list of VisitStates. If a list, a match is found when any of
            the list's elements matches.

        Returns: The first matching node if found, else None.

        Raises:
            TypeError: If visit_state is None.
            ValueError: If visit_state is an empty list.
        """
        if visit_state is None:
            raise TypeError("Visit state cannot be none")
        if isinstance(visit_state, list) and not visit_state:
            raise ValueError("Visit states is empty!")
        return self.__find_leaf_with_visit_state(visit_state, self.root_node)

    def __str__(self):
        """Renders the structure as a multi-line tree drawing, one line per node."""
        return "\n".join(self.__create_log_msg_records(self.root_node))

    def __find_leaf_with_visit_state(self, visit_state: Union[VisitState, List[VisitState]], node: Node):
        """Depth-first search for the first leaf at or below node whose state matches visit_state."""
        is_leaf = len(node.children) == 0
        if is_leaf and self.__visit_state_matches(visit_state, node):
            return node
        for child in node.children:
            descendant_match = self.__find_leaf_with_visit_state(visit_state, child)
            if descendant_match is not None:
                return descendant_match
        return None

    @staticmethod
    def __visit_state_matches(visit_state, node):
        """Tells whether the node's state equals visit_state, or is contained in it when it is a list."""
        if isinstance(visit_state, list):
            return node.visit_state in visit_state
        return node.visit_state == visit_state

    @staticmethod
    def __find_node(name, nodes):
        """Returns the first node in nodes having the given name, or None."""
        for node in nodes:
            if node.name == name:
                return node
        return None

    @staticmethod
    def __split_path(path):
        """Splits a slash separated path into node names."""
        return path.split("/")

    def __create_log_msg_records(self, node: Node, prefix=""):
        """Creates the lines of the tree drawing for node and all of its descendants."""
        records = []
        is_root = node.parent is None
        has_sibling = self.__has_sibling(node)
        node_prefix = prefix + self.__create_node_prefix(is_root, has_sibling)
        records.append(self.__create_single_log_msg_record(node, node_prefix))
        # Descendants are indented below this node; a vertical bar is kept while more siblings follow.
        carry_on_prefix = prefix + self.__create_carry_on_prefix(is_root, has_sibling)
        for child in node.children:
            records.extend(self.__create_log_msg_records(child, carry_on_prefix))
        return records

    @staticmethod
    def __create_single_log_msg_record(node, node_prefix):
        """Creates the line of a single node: only the name for the root, else prefix, state, name and url."""
        is_root = node.parent is None
        if is_root:
            return node.name
        return "{node_prefix}[{visit_state}] {node_name} ({node_url})".format(
            node_prefix=node_prefix, visit_state=node.visit_state.name, node_name=node.name, node_url=node.url)

    @staticmethod
    def __has_sibling(child):
        """Tells whether child is followed by another sibling, i.e. it is not the last child of its parent."""
        if child and child.parent:
            # Compare identity, not names: a tree restored with from_dict may hold siblings sharing a name,
            # and a name comparison would then draw every one of them as the last child.
            return child.parent.children[-1] is not child
        return False

    @staticmethod
    def __create_node_prefix(is_root, has_sibling):
        """Returns the branch drawing placed directly in front of a node's line."""
        if is_root:
            return ""
        if has_sibling:
            return "├── "
        return "└── "

    @staticmethod
    def __create_carry_on_prefix(is_root, has_sibling):
        """Returns the indentation that the lines of a node's descendants are prefixed with."""
        if is_root:
            return ""
        if has_sibling:
            return "|   "
        return "    "

Classes

class Node (name: str, url: str, parent: Node = None)

The node (category). Most sites are built around categories, which in turn can contain sub-categories, etc… Therefore the structure of a site is represented as a tree.

Attributes

name : str
The name of the node.
url : str
The url at which the node (category) is available.
parent : Node
The parent of the node.
visit_state : VisitState
The visit state of the node. Default value is VisitState.NEW
children : List[Node]
Children of node. Default value is an empty list.

Args

name
The name of the node
url
The url at which the node (category) is available.
parent
The node's parent.
Expand source code
class Node:
    """
    A category of a site, modelled as a tree node. Sites are typically organised into categories which may hold
    sub-categories, so the whole site is represented as a tree of nodes.
    Attributes:
        name (str): The name of the node.
        url (str): The url at which the node (category) is available.
        parent (Node): The parent of the node, None for a root node.
        visit_state (VisitState): The visit state of the node. Starts out as VisitState.NEW.
        children (List[Node]): Children of the node. Starts out as an empty list.
    """

    def __init__(self, name: str, url: str, parent: 'Node' = None):
        """
        Args:
            name: The name of the node.
            url: The url at which the node (category) is available.
            parent: The node's parent. The new node is not appended to the parent's children here.
        """
        self.name = name
        self.url = url
        self.visit_state = VisitState.NEW
        self.children: List[Node] = []
        self.parent: Node = parent

    def get_path(self) -> str:
        """
        Returns: The slash (/) separated names of the nodes leading to this node, leaving out the name of the
        root node. The root itself yields an empty string, any other node a path starting with a slash.
        """
        parent = self.parent
        if parent is None:
            return ""
        # The parent's path is "" for the root, so first level nodes end up as "/<name>".
        return "/".join((parent.get_path(), self.name))

    def to_dict(self):
        """
        Returns: A dict representation of the tree rooted at this node. It has the keys "name", "url",
        "visit_state" (the name of the state) and "children" (list of child dicts, empty for a leaf).
        """
        serialized = {
            "name": self.name,
            "url": self.url,
            "visit_state": self.visit_state.name,
            "children": [],
        }
        if not self.children:
            return serialized
        serialized["children"] = [child.to_dict() for child in self.children]
        return serialized

    @classmethod
    def from_dict(cls, node_dict: dict):
        """
        Rebuild a tree from the dict representation produced by to_dict().
        Args:
            node_dict: The dict representation. It must hold the keys "name", "url", "visit_state" and
            "children".

        Returns: The restored tree, with the parent of every restored child set.
        """
        name, url, state_name = (node_dict[key] for key in ("name", "url", "visit_state"))
        state = VisitState[state_name]
        restored = Node(name, url)
        restored.visit_state = state
        # A falsy "children" entry (e.g. an empty list) simply means there is nothing to restore below.
        for child_dict in node_dict["children"] or ():
            child = Node.from_dict(child_dict)
            child.parent = restored
            restored.children.append(child)
        return restored

    def set_visit_state(self, visit_state: VisitState, propagate: bool = False):
        """
        Sets the visit state of this node, optionally applying the same state to every ancestor too.
        Args:
            visit_state: The new state.
            propagate: Whether to propagate to ancestors.
        """
        self.visit_state = visit_state
        if not propagate:
            return
        parent = self.parent
        if parent:
            # Each ancestor repeats the same step, so the state travels up to the root.
            parent.set_visit_state(visit_state, propagate)

Static methods

def from_dict(node_dict: dict)

Construct a tree from its dict representation.

Args

node_dict
The dict representation.

Returns: The restored tree.

Expand source code
@classmethod
def from_dict(cls, node_dict: dict):
    """
    Rebuild a tree from its dict representation.
    Args:
        node_dict: The dict representation. It must hold the keys "name", "url", "visit_state" and
        "children".

    Returns: The restored tree, with the parent of every restored child set.
    """
    name, url, state_name = (node_dict[key] for key in ("name", "url", "visit_state"))
    state = VisitState[state_name]
    restored = Node(name, url)
    restored.visit_state = state
    # A falsy "children" entry (e.g. an empty list) simply means there is nothing to restore below.
    for child_dict in node_dict["children"] or ():
        child = Node.from_dict(child_dict)
        child.parent = restored
        restored.children.append(child)
    return restored

Methods

def get_path(self) ‑> str

Returns: The path of the node separated by slashes (/), excluding the name of the root node. Paths consist of the names of the nodes.

Expand source code
def get_path(self) -> str:
    """
    Returns: The slash (/) separated names of the nodes leading to this node, leaving out the name of the
    root node. The root itself yields an empty string, any other node a path starting with a slash.
    """
    parent = self.parent
    if parent is None:
        return ""
    # The parent's path is "" for the root, so first level nodes end up as "/<name>".
    return "/".join((parent.get_path(), self.name))
def set_visit_state(self, visit_state: VisitState, propagate: bool = False)

Sets the visit state of this node optionally propagating it to ancestors.

Args

visit_state
The new state.
propagate
Whether to propagate to ancestors.
Expand source code
def set_visit_state(self, visit_state: VisitState, propagate: bool = False):
    """
    Sets the visit state of this node, optionally applying the same state to every ancestor too.
    Args:
        visit_state: The new state.
        propagate: Whether to propagate to ancestors.
    """
    self.visit_state = visit_state
    if not propagate:
        return
    parent = self.parent
    if parent:
        # Each ancestor repeats the same step, so the state travels up to the root.
        parent.set_visit_state(visit_state, propagate)
def to_dict(self)

Returns: A dict representation of the tree rooted at this node.

Expand source code
def to_dict(self):
    """
    Returns: A dict representation of the tree rooted at this node. It has the keys "name", "url",
    "visit_state" (the name of the state) and "children" (list of child dicts, empty for a leaf).
    """
    serialized = {
        "name": self.name,
        "url": self.url,
        "visit_state": self.visit_state.name,
        "children": [],
    }
    if not self.children:
        return serialized
    serialized["children"] = [child.to_dict() for child in self.children]
    return serialized
class SiteStructure (name='')

Handles the nodes of the structure.

Attributes

root_node : Node
The root node.

Creates the structure with a root node initialized to the given name prefixed with '(root) '.

Args

name
The name of the root node. (The root node doesn't have an url)
Expand source code
class SiteStructure:
    """
    Handles the nodes of the structure.
    Attributes:
        root_node (Node): The root node.
    """

    def __init__(self, name=""):
        """
        Creates the structure with a root node initialized to the given name prefixed with '(root) '.
        Args:
            name: The name of the root node. (The root node doesn't have an url)
        """
        self.root_node = Node("(root) {}".format(name), "")

    def add_node_with_path(self, path: str, url: str):
        """
        Adds a new node with url under path, where the name of the new node will be the last part of the path.
        Nodes along the path must already exist, except for the last one. Leading and trailing slashes of the
        path are ignored.
        Args:
            path (str): The path.
            url (str): The url.

        Returns: The newly created node.

        Raises:
            RuntimeError: If a node already exists at path, or if the parent path does not exist.
        """
        if self.get_node_at_path(path) is not None:
            raise RuntimeError("Path \"{}\" already exists!".format(path))
        node_names = self.__split_path(path.strip("/"))
        # A single-part path is attached directly below the root.
        parent = self.root_node
        if len(node_names) > 1:
            parent_path = "/".join(node_names[:-1])
            parent = self.get_node_at_path(parent_path)
            if parent is None:
                raise RuntimeError("Parent path \"{}\" not existing!".format(parent_path))
        node = Node(node_names[-1], url, parent)
        parent.children.append(node)
        return node

    def get_node_at_path(self, path: str) -> Optional[Node]:
        """
        Gets a node at path. Leading and trailing slashes of the path are ignored, and the root node's name is
        not part of the path.
        Args:
            path (str): The path

        Returns: The node if found, else None.
        """
        children = self.root_node.children
        result_node = None
        for node_name in self.__split_path(path.strip("/")):
            result_node = self.__find_node(node_name, children)
            if result_node is None:
                # One missing part makes the whole path missing.
                break
            children = result_node.children
        return result_node

    def to_dict(self):
        """
        Returns: The structure as a dict.
        """
        return self.root_node.to_dict()

    @classmethod
    def from_dict(cls, struct_dict):
        """
        Creates a structure from its dict representation.
        Args:
            struct_dict: The dict.

        Returns: The site structure.
        """
        structure = SiteStructure()
        # The restored root replaces the default one created by the constructor.
        structure.root_node = Node.from_dict(struct_dict)
        return structure

    def find_leaf_with_visit_state(self, visit_state: Union[VisitState, List[VisitState]]) -> Optional[Node]:
        """
        Finds a leaf node with matching visit state(s). Search is DFS and starts at the root node, which is
        itself treated as a leaf while it has no children.
        Args:
            visit_state: Either a VisitState, or a list of VisitStates. If a list, a match is found when any of
            the list's elements matches.

        Returns: The first matching node if found, else None.

        Raises:
            TypeError: If visit_state is None.
            ValueError: If visit_state is an empty list.
        """
        if visit_state is None:
            raise TypeError("Visit state cannot be none")
        if isinstance(visit_state, list) and not visit_state:
            raise ValueError("Visit states is empty!")
        return self.__find_leaf_with_visit_state(visit_state, self.root_node)

    def __str__(self):
        """Renders the structure as a multi-line tree drawing, one line per node."""
        return "\n".join(self.__create_log_msg_records(self.root_node))

    def __find_leaf_with_visit_state(self, visit_state: Union[VisitState, List[VisitState]], node: Node):
        """Depth-first search for the first leaf at or below node whose state matches visit_state."""
        is_leaf = len(node.children) == 0
        if is_leaf and self.__visit_state_matches(visit_state, node):
            return node
        for child in node.children:
            descendant_match = self.__find_leaf_with_visit_state(visit_state, child)
            if descendant_match is not None:
                return descendant_match
        return None

    @staticmethod
    def __visit_state_matches(visit_state, node):
        """Tells whether the node's state equals visit_state, or is contained in it when it is a list."""
        if isinstance(visit_state, list):
            return node.visit_state in visit_state
        return node.visit_state == visit_state

    @staticmethod
    def __find_node(name, nodes):
        """Returns the first node in nodes having the given name, or None."""
        for node in nodes:
            if node.name == name:
                return node
        return None

    @staticmethod
    def __split_path(path):
        """Splits a slash separated path into node names."""
        return path.split("/")

    def __create_log_msg_records(self, node: Node, prefix=""):
        """Creates the lines of the tree drawing for node and all of its descendants."""
        records = []
        is_root = node.parent is None
        has_sibling = self.__has_sibling(node)
        node_prefix = prefix + self.__create_node_prefix(is_root, has_sibling)
        records.append(self.__create_single_log_msg_record(node, node_prefix))
        # Descendants are indented below this node; a vertical bar is kept while more siblings follow.
        carry_on_prefix = prefix + self.__create_carry_on_prefix(is_root, has_sibling)
        for child in node.children:
            records.extend(self.__create_log_msg_records(child, carry_on_prefix))
        return records

    @staticmethod
    def __create_single_log_msg_record(node, node_prefix):
        """Creates the line of a single node: only the name for the root, else prefix, state, name and url."""
        is_root = node.parent is None
        if is_root:
            return node.name
        return "{node_prefix}[{visit_state}] {node_name} ({node_url})".format(
            node_prefix=node_prefix, visit_state=node.visit_state.name, node_name=node.name, node_url=node.url)

    @staticmethod
    def __has_sibling(child):
        """Tells whether child is followed by another sibling, i.e. it is not the last child of its parent."""
        if child and child.parent:
            # Compare identity, not names: a tree restored with from_dict may hold siblings sharing a name,
            # and a name comparison would then draw every one of them as the last child.
            return child.parent.children[-1] is not child
        return False

    @staticmethod
    def __create_node_prefix(is_root, has_sibling):
        """Returns the branch drawing placed directly in front of a node's line."""
        if is_root:
            return ""
        if has_sibling:
            return "├── "
        return "└── "

    @staticmethod
    def __create_carry_on_prefix(is_root, has_sibling):
        """Returns the indentation that the lines of a node's descendants are prefixed with."""
        if is_root:
            return ""
        if has_sibling:
            return "|   "
        return "    "

Static methods

def from_dict(struct_dict)

Creates a structure from its dict representation.

Args

struct_dict
The dict.

Returns: The site structure.

Expand source code
@classmethod
def from_dict(cls, struct_dict):
    """
    Builds a site structure out of its dict representation.
    Args:
        struct_dict: The dict, describing the tree starting from the root node.

    Returns: The site structure.
    """
    restored_root = Node.from_dict(struct_dict)
    restored = SiteStructure()
    # The restored root replaces the default one created by the constructor.
    restored.root_node = restored_root
    return restored

Methods

def add_node_with_path(self, path: str, url: str)

Adds a new node with url under path, where the name of the new node will be the last part of the path. Nodes along the path must already exist, except for the last node.

Args

path : str
The path.
url : str
The url.

Returns: The newly created node.

Expand source code
def add_node_with_path(self, path: str, url: str):
    """
    Creates a node with the given url at path. The last part of the path becomes the name of the new node,
    all the parts before it must name nodes that already exist. Leading and trailing slashes are ignored.
    Args:
        path (str): The path.
        url (str): The url.

    Returns: The newly created node.

    Raises:
        RuntimeError: If a node already exists at path, or if the parent path does not exist.
    """
    if self.get_node_at_path(path) is not None:
        raise RuntimeError("Path \"{}\" already exists!".format(path))
    *ancestor_names, new_node_name = self.__split_path(path.strip("/"))
    # Without ancestors in the path the node goes directly below the root.
    parent = self.root_node
    if ancestor_names:
        parent_path = "/".join(ancestor_names)
        parent = self.get_node_at_path(parent_path)
        if not parent:
            raise RuntimeError("Parent path \"{}\" not existing!".format(parent_path))
    created = Node(new_node_name, url, parent)
    parent.children.append(created)
    return created
def find_leaf_with_visit_state(self, visit_state: Union[VisitState, List[VisitState]]) ‑> Union[Node, NoneType]

Finds a leaf node with matching visit state(s). Search is DFS.

Args

visit_state
Either a VisitState, or a list of VisitStates. If a list, a match is found when any of the list's elements matches.

Returns: The first matching node if found, else None.

Expand source code
def find_leaf_with_visit_state(self, visit_state: Union[VisitState, List[VisitState]]) -> Optional[Node]:
    """
    Looks for a leaf node whose visit state matches. Search is DFS, starting at the root node.
    Args:
        visit_state: Either a VisitState, or a list of VisitStates. If a list, a match is found when any of
        the list's elements matches.

    Returns: The first matching node if found, else None.

    Raises:
        TypeError: If visit_state is None.
        ValueError: If visit_state is an empty list.
    """
    if visit_state is None:
        raise TypeError("Visit state cannot be none")
    is_empty_list = isinstance(visit_state, list) and len(visit_state) == 0
    if is_empty_list:
        raise ValueError("Visit states is empty!")
    search_root = self.root_node
    return self.__find_leaf_with_visit_state(visit_state, search_root)
def get_node_at_path(self, path: str) ‑> Node

Gets a node at path.

Args

path : str
The path

Returns: The node if found, else None.

Expand source code
def get_node_at_path(self, path: str) -> Node:
    """
    Looks up the node at path, descending from the root's children one path part at a time. Leading and
    trailing slashes of the path are ignored.
    Args:
        path (str): The path

    Returns: The node if found, else None.
    """
    candidates = self.root_node.children
    found = None
    for part in self.__split_path(path.strip("/")):
        found = self.__find_node(part, candidates)
        if found is None:
            # One missing part makes the whole path missing.
            return None
        candidates = found.children
    return found
def to_dict(self)

Returns: The structure as a dict.

Expand source code
def to_dict(self):
    """
    Returns: The structure as a dict, which is the dict representation of the root node.
    """
    root = self.root_node
    return root.to_dict()
class VisitState (value, names=None, *, module=None, qualname=None, type=None, start=1)

Visit state of a category (node)

Expand source code
class VisitState(Enum):
    """Visit state of a category (node).

    The member *names* are what Node.to_dict() stores and what Node.from_dict() uses to look a member up again,
    so renaming a member makes previously saved dicts unreadable.
    """
    # Initial state: Node.__init__ assigns it to every newly created node.
    NEW = 0
    # NOTE(review): presumably set while the node's sub-tree is still being processed - confirm against callers.
    IN_PROGRESS = 1
    # NOTE(review): presumably set once the node has been completely processed - confirm against callers.
    VISITED = 2

Ancestors

  • enum.Enum

Class variables

var IN_PROGRESS
var NEW
var VISITED