Scraper class
Convenience class to easily use the available decorators.
Source code in dude/scraper.py
class Scraper(ScraperBase):
    """
    Convenience class to easily use the available decorators.
    """

    def run(
        self,
        urls: Sequence[str],
        pages: int = 1,
        proxy: Optional[Any] = None,
        output: Optional[str] = None,
        format: str = "json",
        follow_urls: bool = False,
        save_per_page: bool = False,
        ignore_robots_txt: bool = False,
        # extra args
        parser: str = "playwright",
        headless: bool = True,
        browser_type: str = "chromium",
        **kwargs: Any,
    ) -> None:
        """
        Convenience method to handle switching between different types of parser backends.

        :param urls: List of website URLs.
        :param pages: Maximum number of pages to crawl before exiting (default=1). This is only used when a navigate handler is defined.  # noqa
        :param proxy: Proxy settings.
        :param output: Output file. If not provided, prints in the terminal.
        :param format: Output file format. If not provided, uses the extension of the output file or defaults to json.
        :param follow_urls: Automatically follow URLs.
        :param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
        :param ignore_robots_txt: Flag to ignore robots.txt.

        :param parser: Parser backend ["playwright" (default), "bs4", "parsel", "lxml" or "selenium"]
        :param headless: Enables headless browser. (default=True)
        :param browser_type: Playwright supported browser types ("chromium", "chrome", "webkit", or "firefox").
        """
        logger.info("Scraper started...")

        if not self.scraper:
            scraper_class: Type[ScraperBase]
            if parser == "bs4":
                from .optional.beautifulsoup_scraper import BeautifulSoupScraper

                scraper_class = BeautifulSoupScraper
            elif parser == "parsel":
                from .optional.parsel_scraper import ParselScraper

                scraper_class = ParselScraper
            elif parser == "lxml":
                from .optional.lxml_scraper import LxmlScraper

                scraper_class = LxmlScraper
            elif parser == "selenium":
                from .optional.selenium_scraper import SeleniumScraper

                scraper_class = SeleniumScraper
            else:
                scraper_class = PlaywrightScraper

            self.scraper = scraper_class(
                rules=self.rules,
                groups=self.groups,
                save_rules=self.save_rules,
                events=self.events,
                has_async=self.has_async,
                requests=self.requests,
            )

        if not ignore_robots_txt:
            logger.info(
                f"""robots.txt is currently not ignored.
{"=" * 80}
Any rules/restrictions set in a website's robots.txt will be followed by default.
To ignore robots.txt, add `--ignore-robots-txt` to CLI arguments or pass `ignore_robots_txt=True` to `run()`.
{"=" * 80}""",
            )

        self.scraper.run(
            urls=urls,
            pages=pages,
            proxy=proxy,
            output=output,
            format=format,
            follow_urls=follow_urls,
            save_per_page=save_per_page or follow_urls,
            ignore_robots_txt=ignore_robots_txt,
            **{"headless": headless, "browser_type": browser_type},
        )
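For orientation, here is a minimal usage sketch of the class (an illustration, not taken verbatim from the library's documentation; it assumes `Scraper` is importable from the package root and that, with the default Playwright backend, `select` handlers receive a Playwright element handle):

from dude import Scraper

scraper = Scraper()

# Register an extraction handler through the class's `select` decorator.
# The CSS selector and the returned dictionary key are made-up examples.
@scraper.select(css="a.title")
def result_title(element):
    return {"title": element.text_content()}

if __name__ == "__main__":
    # Start scraping with the default Playwright backend.
    scraper.run(urls=["https://example.com"], output="output.json")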
run(self, urls, pages=1, proxy=None, output=None, format='json', follow_urls=False, save_per_page=False, ignore_robots_txt=False, parser='playwright', headless=True, browser_type='chromium', **kwargs)
Convenience method to handle switching between different types of parser backends.
:param urls: List of website URLs.
:param pages: Maximum number of pages to crawl before exiting (default=1). This is only used when a navigate handler is defined.
:param proxy: Proxy settings.
:param output: Output file. If not provided, prints in the terminal.
:param format: Output file format. If not provided, uses the extension of the output file or defaults to json.
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param parser: Parser backend ["playwright" (default), "bs4", "parsel", "lxml" or "selenium"]
:param headless: Enables headless browser. (default=True)
:param browser_type: Playwright supported browser types ("chromium", "chrome", "webkit", or "firefox").
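To illustrate the backend switch this method performs, here is a hedged sketch reusing the `scraper` instance from the earlier example (the output path is hypothetical, and the optional BeautifulSoup dependency must be installed, e.g. via the project's bs4 extra):

# Run the same registered handlers with the BeautifulSoup backend instead of
# Playwright; run() imports BeautifulSoupScraper lazily when parser="bs4".
scraper.run(
    urls=["https://example.com"],
    parser="bs4",           # selects the BeautifulSoup backend (see source below)
    output="data.json",     # hypothetical path; format falls back to the file extension
)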
Source code in dude/scraper.py
def run(
    self,
    urls: Sequence[str],
    pages: int = 1,
    proxy: Optional[Any] = None,
    output: Optional[str] = None,
    format: str = "json",
    follow_urls: bool = False,
    save_per_page: bool = False,
    ignore_robots_txt: bool = False,
    # extra args
    parser: str = "playwright",
    headless: bool = True,
    browser_type: str = "chromium",
    **kwargs: Any,
) -> None:
    """
    Convenience method to handle switching between different types of parser backends.

    :param urls: List of website URLs.
    :param pages: Maximum number of pages to crawl before exiting (default=1). This is only used when a navigate handler is defined.  # noqa
    :param proxy: Proxy settings.
    :param output: Output file. If not provided, prints in the terminal.
    :param format: Output file format. If not provided, uses the extension of the output file or defaults to json.
    :param follow_urls: Automatically follow URLs.
    :param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
    :param ignore_robots_txt: Flag to ignore robots.txt.

    :param parser: Parser backend ["playwright" (default), "bs4", "parsel", "lxml" or "selenium"]
    :param headless: Enables headless browser. (default=True)
    :param browser_type: Playwright supported browser types ("chromium", "chrome", "webkit", or "firefox").
    """
    logger.info("Scraper started...")

    if not self.scraper:
        scraper_class: Type[ScraperBase]
        if parser == "bs4":
            from .optional.beautifulsoup_scraper import BeautifulSoupScraper

            scraper_class = BeautifulSoupScraper
        elif parser == "parsel":
            from .optional.parsel_scraper import ParselScraper

            scraper_class = ParselScraper
        elif parser == "lxml":
            from .optional.lxml_scraper import LxmlScraper

            scraper_class = LxmlScraper
        elif parser == "selenium":
            from .optional.selenium_scraper import SeleniumScraper

            scraper_class = SeleniumScraper
        else:
            scraper_class = PlaywrightScraper

        self.scraper = scraper_class(
            rules=self.rules,
            groups=self.groups,
            save_rules=self.save_rules,
            events=self.events,
            has_async=self.has_async,
            requests=self.requests,
        )

    if not ignore_robots_txt:
        logger.info(
            f"""robots.txt is currently not ignored.
{"=" * 80}
Any rules/restrictions set in a website's robots.txt will be followed by default.
To ignore robots.txt, add `--ignore-robots-txt` to CLI arguments or pass `ignore_robots_txt=True` to `run()`.
{"=" * 80}""",
        )

    self.scraper.run(
        urls=urls,
        pages=pages,
        proxy=proxy,
        output=output,
        format=format,
        follow_urls=follow_urls,
        save_per_page=save_per_page or follow_urls,
        ignore_robots_txt=ignore_robots_txt,
        **{"headless": headless, "browser_type": browser_type},
    )
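Finally, a sketch of the browser-related arguments and the robots.txt override (again illustrative; the URL and output path are placeholders):

# Playwright backend with a visible Firefox window. Note that run() passes
# save_per_page=True automatically whenever follow_urls=True, and that
# ignore_robots_txt=True skips the robots.txt check (and the notice logged above).
scraper.run(
    urls=["https://example.com"],
    parser="playwright",
    headless=False,          # show the browser window
    browser_type="firefox",
    follow_urls=True,        # crawl discovered links; implies save_per_page
    ignore_robots_txt=True,
    output="crawl.json",     # hypothetical output file
)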