# Source code for novelsave_sources.sources.crawler

import datetime
import re
from abc import ABC
from typing import List, Union, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup, Comment
from requests.cookies import RequestsCookieJar

from ..exceptions import BadResponseException
from ..utils.gateways import BaseHttpGateway, DefaultHttpGateway


class Crawler(ABC):
    """Base crawler class

    Implements crawler specific helper methods that can be used
    when parsing html content.

    Attributes:
        lang (str): The language of the content available through the source.
            It is specified ``multi`` if the source supports multiple languages.

        base_urls (List[str]): The url prefixes (scheme included, since the
            scheme is read from the first entry) of the websites that this
            crawler supports.

        last_updated (datetime.date): The date at which the specific crawler
            implementation was last updated.

        bad_tags (List[str]): List of names of tags that should be removed from
            chapter content for this specific crawler.

        blacklist_patterns (List[str]): List of regex patterns denoting text that
            should be removed from chapter content.

        notext_tags (List[str]): List of names of tags that even if there is no
            text should not be removed from chapter content.

            Elements with no text are usually removed from the chapter content,
            unless the element is specified in this list.

        preserve_attrs (List[str]): Element attributes that contain meaningful
            content and should be kept within the element during attribute cleanup.
    """

    lang: str
    base_urls: List[str]
    last_updated: datetime.date

    @classmethod
    def of(cls, url: str) -> bool:
        """Check whether the url is from this source

        The source implementations may override this method to provide
        custom matching functionality.

        The default implementation checks if the url starts with
        any of the base urls of the source (a plain prefix match).

        :param url: The url to test if it belongs to this source
        :type url: str

        :return: Whether the url is from this source
        :rtype: bool
        """
        # str.startswith accepts a tuple of candidate prefixes
        return url.startswith(tuple(cls.base_urls))

    def __init__(self, http_gateway: Optional[BaseHttpGateway] = None):
        """Create the crawler and run the :meth:`init` hook

        :param http_gateway: The gateway used to send requests; a
            ``DefaultHttpGateway`` is created when omitted
        :type http_gateway: Optional[BaseHttpGateway]
        """
        self.http_gateway = (
            http_gateway if http_gateway is not None else DefaultHttpGateway()
        )

        # The cleanup lists are declared on the class, so editing them in place
        # from ``init`` would silently change every other crawler as well.
        # Each instance therefore works on its own copies.
        self.bad_tags = list(self.bad_tags)
        self.blacklist_patterns = list(self.blacklist_patterns)
        self.notext_tags = list(self.notext_tags)
        self.preserve_attrs = list(self.preserve_attrs)

        self.init()

    def init(self):
        """Override this method instead of __init__ for trivial purposes

        It is called at the end of ``__init__``.

        The purpose can be any of:

        - editing bad_tags or blacklist_patterns
        """

    def set_cookies(self, cookies: RequestsCookieJar):
        """Replace the cookies of the http gateway with the provided cookie jar"""
        self.http_gateway.cookies = cookies

    def get_soup(self, url: str, method: str = "GET", **kwargs) -> BeautifulSoup:
        """Makes a request to the url and attempts to make a :class:`BeautifulSoup`
        object from the response content.

        Once the response is acquired, soup object is created using
        :meth:`~novelsave_sources.sources.Crawler.make_soup`.
        Then the soup object is checked for the ``body`` to check if document
        was retrieved successfully.

        :param url: forwarded to :meth:`~novelsave_sources.sources.Crawler.request`
        :type url: str

        :param method: forwarded to :meth:`~novelsave_sources.sources.Crawler.request`
        :type method: str

        :param kwargs: forwarded to :meth:`~novelsave_sources.sources.Crawler.request`

        :return: The created soup object
        :rtype: BeautifulSoup

        :raises ConnectionError: If document was not retrieved successfully
        """
        response = self.request(method, url, **kwargs)
        soup = self.make_soup(response.content, "lxml")
        if not soup.find("body"):
            raise ConnectionError("HTML document was not loaded correctly.")

        return soup

    @staticmethod
    def make_soup(text: Union[str, bytes], parser: str = "lxml") -> BeautifulSoup:
        """Create a new soup object using the specified parser

        :param text: The content for the soup
        :type text: str | bytes

        :param parser: The html tree parser to use (default = 'lxml')
        :type parser: str

        :return: The created soup object
        :rtype: BeautifulSoup
        """
        return BeautifulSoup(text, parser)

    def request(self, method: str, url: str, **kwargs) -> requests.Response:
        """Send a request to the provided url using the specified method

        Checks if the response is valid before returning, if its not valid
        throws an exception.

        :param method: Request method ex: GET, POST, PUT
        :type method: str

        :param url: The url endpoint to make the request to
        :type url: str

        :param kwargs: Forwarded to
            :meth:`http_gateway.request <novelsave_sources.utils.gateways.BaseHttpGateway.request>`

        :return: The response from the request
        :rtype: requests.Response

        :raises BadResponseException: if the response is not valid
            (``response.ok`` is false)
        """
        response = self.http_gateway.request(method, url, **kwargs)
        if not response.ok:
            raise BadResponseException(response)

        return response

    def request_get(self, url: str, **kwargs) -> requests.Response:
        """Creates a get request to the specified url"""
        return self.request("GET", url, **kwargs)

    # ---- Inspired from https://github.com/dipu-bd/lightnovel-crawler ----
    # ---- And almost a perfect copy of the functions below ----

    bad_tags: List[str] = [
        "noscript",
        "script",
        "style",
        "iframe",
        "ins",
        "header",
        "footer",
        "button",
        "input",
        "amp-auto-ads",
        "pirate",
        "figcaption",
        "address",
        "tfoot",
        "object",
        "video",
        "audio",
        "source",
        "nav",
        "output",
        "select",
        "textarea",
        "form",
        "map",
    ]

    blacklist_patterns: List[str] = []

    notext_tags: List[str] = [
        "img",
    ]

    preserve_attrs: List[str] = [
        "href",
        "src",
        "alt",
    ]

    def is_blacklisted(self, text: str) -> bool:
        """Whether the text matches any of the blacklist patterns (case insensitive)"""
        return any(
            re.search(pattern, text, re.IGNORECASE)
            for pattern in self.blacklist_patterns
        )

    def clean_contents(self, contents):
        """Remove unnecessary elements and attributes

        All attributes of ``contents`` itself are dropped, then every comment
        and every tag below it is passed through :meth:`clean_element`.

        :param contents: The root element of the content to clean
        :return: The same ``contents`` object, cleaned in place
        """
        if not contents:
            return contents

        contents.attrs = {}

        # ``find_all(True)`` only yields tags, so comments have to be collected
        # separately. The nodes are gathered up front because cleaning modifies
        # the tree that is being walked.
        nodes = [node for node in contents.descendants if isinstance(node, Comment)]
        nodes.extend(contents.find_all(True))
        for node in nodes:
            self.clean_element(node)

        return contents

    def clean_element(self, element):
        """
        If the element does not add any meaningful content the element
        is removed, this can happen on either of below conditions.

        - Element is a comment
        - Element is a <br> and the next sibling element is also a <br>
        - Element is part of the bad tags (undesired tags that dont add content)
        - The element has no text and has no children and is not part of notext_tags
          (elements that doesnt need text to be meaningful)
        - The text of the element matches one of the blacklisted patterns
          (undesirable text such as ads and watermarks)

        If none of the conditions are met, all the attributes except those marked
        important :attr:`preserve_attrs` are removed from this element
        """
        # remove comments
        if isinstance(element, Comment):
            element.extract()
            return

        name = element.name
        if name == "br":
            # collapse consecutive line breaks; the sibling may be a plain
            # string, which is not guaranteed to expose ``name``
            if getattr(element.next_sibling, "name", None) == "br":
                element.extract()
                return
        elif name in self.bad_tags:
            # Remove bad tags
            element.extract()
            return
        else:
            text = element.text
            if not text.strip():
                # Remove empty elements
                if name not in self.notext_tags and not element.find_all(
                    recursive=False
                ):
                    element.extract()
                    return
            elif self.is_blacklisted(text):
                # Remove blacklisted elements
                element.extract()
                return

        # The element is kept: remove attributes that are not marked important
        if hasattr(element, "attrs"):
            element.attrs = {
                key: element.get(key)
                for key in self.preserve_attrs
                if key in element.attrs
            }

    @staticmethod
    def find_paragraphs(element, **kwargs) -> List[str]:
        """Extract all text of the element into paragraphs

        :param element: The element whose text nodes are collected
        :param kwargs: forwarded to ``element.find_all``

        :return: The stripped, non-empty text nodes in document order
        :rtype: List[str]
        """
        stripped = (str(node).strip() for node in element.find_all(text=True, **kwargs))
        return [text for text in stripped if text]

    def to_absolute_url(self, url: str, current_url: Optional[str] = None) -> str:
        """Detects the url state and converts it into the appropriate absolute url

        There are several relevant states the url could be in:

        - absolute: starts with either 'https://' or 'http://', in this the url
          is returned as it without any changes.

        - missing schema: schema is missing and the url starts with '//', in this
          case the appropriate schema from either current url or base url is prefixed.

        - relative absolute: the url is relative to the website and starts with '/', in
          this case the first base url is prefixed to the url.

        - relative current: the url is relative to the current webpage and does not
          match any of the above conditions, in this case the url is resolved against
          the current url provided (or the first base url when it is not provided).

        :param url: The url to be converted
        :type url: str

        :param current_url: The webpage from which the url is extracted
        :type current_url: Optional[str]

        :return: The absolute converted url
        :rtype: str
        """
        if url.startswith(("http://", "https://")):
            return url

        if url.startswith("//"):
            return f"{urlparse(current_url or self.base_urls[0]).scheme}:{url}"

        if url.startswith("/"):
            return self.base_urls[0].rstrip("/") + url

        # Plain concatenation loses the path separator and fails without a
        # current url; urljoin resolves the reference the way a browser would
        return urljoin(current_url or self.base_urls[0], url)