Source code for pyppeteer.launcher

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Chromium process launcher module."""

import asyncio
import atexit
import json
from urllib.request import urlopen
from urllib.error import URLError
import logging
import os
import os.path
from pathlib import Path
import shutil
import signal
import subprocess
import sys
import tempfile
import time
from typing import Any, Dict, List, TYPE_CHECKING

from pyppeteer import __pyppeteer_home__
from pyppeteer.browser import Browser
from pyppeteer.connection import Connection
from pyppeteer.errors import BrowserError
from pyppeteer.helper import addEventListener, debugError, removeEventListeners
from pyppeteer.target import Target
from pyppeteer.util import check_chromium, chromium_executable
from pyppeteer.util import download_chromium, merge_dict, get_free_port

if TYPE_CHECKING:
    from typing import Optional  # noqa: F401

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# ``CHROME_PROFILE_PATH`` is the directory (under the pyppeteer home) inside
# which ``Launcher._parse_args`` creates a temporary user-data directory when
# the caller does not supply one.
pyppeteer_home = Path(__pyppeteer_home__)
CHROME_PROFILE_PATH = pyppeteer_home / '.dev_profile'

# Command-line flags added to every launch unless the ``ignoreDefaultArgs``
# option is set (see ``Launcher.__init__``).
DEFAULT_ARGS = [
    '--disable-background-networking',
    '--disable-background-timer-throttling',
    '--disable-breakpad',
    '--disable-browser-side-navigation',
    '--disable-client-side-phishing-detection',
    '--disable-default-apps',
    '--disable-dev-shm-usage',
    '--disable-extensions',
    '--disable-features=site-per-process',
    '--disable-hang-monitor',
    '--disable-popup-blocking',
    '--disable-prompt-on-repost',
    '--disable-sync',
    '--disable-translate',
    '--metrics-recording-only',
    '--no-first-run',
    '--safebrowsing-disable-auto-update',
]

# Additional flags added by ``Launcher.__init__`` when neither ``appMode`` nor
# ``ignoreDefaultArgs`` is set.
AUTOMATION_ARGS = [
    '--enable-automation',
    '--password-store=basic',
    '--use-mock-keychain',
]


class Launcher(object):
    """Chrome process launcher class.

    An instance builds the browser command line in ``__init__`` (stored in
    ``self.cmd``), starts the process in :meth:`launch`, and tears it down in
    :meth:`killChrome`.
    """

    def __init__(self, options: Dict[str, Any] = None,  # noqa: C901
                 **kwargs: Any) -> None:
        """Make new launcher.

        :arg options: Launch options; combined with ``kwargs`` through
            ``merge_dict``.
        :arg kwargs: Launch options given as keyword arguments.

        The options read here are ``loop``, ``logLevel``,
        ``ignoreDefaultArgs``, ``appMode``, ``devtools``, ``headless``,
        ``args``, ``userDataDir`` and ``executablePath``.  If
        ``executablePath`` is absent, chromium is downloaded when
        ``check_chromium()`` reports it is missing.
        """
        self.options = merge_dict(options, kwargs)
        self.port = get_free_port()
        self.url = f'http://127.0.0.1:{self.port}'
        self.chrome_args: List[str] = []

        # Only ask asyncio for a loop when the caller did not provide one;
        # evaluating ``get_event_loop()`` as a ``dict.get`` default would run
        # it unconditionally.
        loop = self.options.get('loop')
        self._loop = loop if loop is not None else asyncio.get_event_loop()

        # Defined up front so ``killChrome`` is safe to call before
        # ``launch`` has run.
        self.connection: Optional[Connection] = None

        logLevel = self.options.get('logLevel')
        if logLevel:
            logging.getLogger('pyppeteer').setLevel(logLevel)

        ignore_default_args = self.options.get('ignoreDefaultArgs', False)

        if not ignore_default_args:
            self.chrome_args.extend(DEFAULT_ARGS)
            self.chrome_args.append(
                f'--remote-debugging-port={self.port}',
            )

        self.chromeClosed = True
        if self.options.get('appMode', False):
            self.options['headless'] = False
        elif not ignore_default_args:
            self.chrome_args.extend(AUTOMATION_ARGS)

        self._tmp_user_data_dir: Optional[str] = None
        self._parse_args()

        if self.options.get('devtools'):
            self.chrome_args.append('--auto-open-devtools-for-tabs')
            self.options['headless'] = False

        # Headless is the default: it is used when the option is missing or
        # truthy.
        if 'headless' not in self.options or self.options.get('headless'):
            self.chrome_args.extend([
                '--headless',
                '--disable-gpu',
                '--hide-scrollbars',
                '--mute-audio',
            ])

        # Open ``about:blank`` when the user-supplied ``args`` list contains
        # only flags (no positional URL).  As in the original code this is
        # only done when ``args`` is actually a list.
        user_args = self.options.get('args')
        if (not ignore_default_args and
                isinstance(user_args, list) and
                all(arg.startswith('-') for arg in user_args)):
            self.chrome_args.append('about:blank')

        if 'executablePath' in self.options:
            self.exec = self.options['executablePath']
        else:
            if not check_chromium():
                download_chromium()
            self.exec = str(chromium_executable())

        self.cmd = [self.exec] + self.chrome_args

    def _parse_args(self) -> None:
        """Add ``--user-data-dir`` and the user-supplied ``args``.

        When neither a ``--user-data-dir...`` entry in ``args`` nor the
        ``userDataDir`` option is given, a temporary directory is created
        under ``CHROME_PROFILE_PATH`` and remembered in
        ``self._tmp_user_data_dir`` so it can be removed later.
        """
        user_args = self.options.get('args')
        has_user_args = isinstance(user_args, list)
        user_data_dir_in_args = has_user_args and any(
            opt.startswith('--user-data-dir') for opt in user_args)

        if not user_data_dir_in_args:
            if 'userDataDir' not in self.options:
                # ``exist_ok`` avoids the check-then-create race between two
                # launchers starting at the same time.
                CHROME_PROFILE_PATH.mkdir(parents=True, exist_ok=True)
                self._tmp_user_data_dir = tempfile.mkdtemp(
                    dir=str(CHROME_PROFILE_PATH))
            self.chrome_args.append('--user-data-dir={}'.format(
                self.options.get('userDataDir', self._tmp_user_data_dir)))
        if has_user_args:
            self.chrome_args.extend(user_args)

    def _cleanup_tmp_user_data_dir(self) -> None:
        """Remove the temporary user data directory, retrying on failure.

        Up to 100 removal attempts are made, 10 ms apart.

        :raises IOError: If the directory still exists after all attempts.
        """
        tmp_dir = self._tmp_user_data_dir
        if not tmp_dir:
            return
        for _ in range(100):
            if not os.path.exists(tmp_dir):
                return
            shutil.rmtree(tmp_dir, ignore_errors=True)
            if not os.path.exists(tmp_dir):
                # Return as soon as removal succeeds, so success on the very
                # last attempt is not reported as a failure.
                return
            time.sleep(0.01)
        raise IOError('Unable to remove Temporary User Data')

    async def launch(self) -> Browser:  # noqa: C901
        """Start chrome process and return `Browser` object.

        Spawns ``self.cmd``, installs the exit/signal handlers selected by
        the ``autoClose``, ``handleSIGINT``, ``handleSIGTERM`` and
        ``handleSIGHUP`` options, connects to the browser's websocket
        endpoint and waits for an initial page target.

        :raises BrowserError: If the browser process exits before its
            websocket endpoint could be read.
        """
        self.chromeClosed = False
        self.connection = None

        options = dict()
        options['env'] = self.options.get('env')
        if not self.options.get('dumpio'):
            # Capture the browser's output so it can be included in the
            # error message if the process dies during start-up.
            options['stdout'] = subprocess.PIPE
            options['stderr'] = subprocess.STDOUT

        self.proc = subprocess.Popen(  # type: ignore
            self.cmd,
            **options,
        )

        def _close_process(*args: Any, **kwargs: Any) -> None:
            # NOTE(review): ``run_until_complete`` raises RuntimeError when
            # the loop is already running or closed, so this handler only
            # completes cleanly when the loop is idle.  Behaviour kept as-is.
            if not self.chromeClosed:
                self._loop.run_until_complete(self.killChrome())

        # don't forget to close browser process
        if self.options.get('autoClose', True):
            atexit.register(_close_process)
        if self.options.get('handleSIGINT', True):
            signal.signal(signal.SIGINT, _close_process)
        if self.options.get('handleSIGTERM', True):
            signal.signal(signal.SIGTERM, _close_process)
        if not sys.platform.startswith('win'):
            # SIGHUP is not defined on windows
            if self.options.get('handleSIGHUP', True):
                signal.signal(signal.SIGHUP, _close_process)

        connectionDelay = self.options.get('slowMo', 0)
        self.browserWSEndpoint = self._get_ws_endpoint()
        logger.info(f'Browser listening on: {self.browserWSEndpoint}')
        self.connection = Connection(
            self.browserWSEndpoint, self._loop, connectionDelay)
        ignoreHTTPSErrors = bool(self.options.get('ignoreHTTPSErrors', False))
        setDefaultViewport = not self.options.get('appMode', False)
        browser = await Browser.create(
            self.connection, [], ignoreHTTPSErrors, setDefaultViewport,
            self.proc, self.killChrome)
        await self.ensureInitialPage(browser)
        return browser

    async def ensureInitialPage(self, browser: Browser) -> None:
        """Wait for initial page target to be created.

        Returns immediately if ``browser`` already has a target whose type
        is ``'page'``; otherwise waits for a ``targetcreated`` event that
        carries one.
        """
        for target in browser.targets():
            if target.type == 'page':
                return

        initialPagePromise = self._loop.create_future()

        def check_target(target: Target) -> None:
            # Several page targets may be announced before the listener is
            # removed; resolving an already-finished future would raise
            # InvalidStateError.
            if target.type == 'page' and not initialPagePromise.done():
                initialPagePromise.set_result(True)

        listeners = [addEventListener(browser, 'targetcreated', check_target)]
        try:
            await initialPagePromise
        finally:
            # Detach the listener even if the wait is cancelled.
            removeEventListeners(listeners)

    def _get_ws_endpoint(self) -> str:
        """Poll ``<self.url>/json/version`` for the websocket debugger URL.

        Polls every 0.1 s for as long as the browser process is alive.  This
        is a blocking call (``time.sleep`` / ``urlopen``).

        :raises BrowserError: If the process exits before the endpoint
            answers.  The message includes the captured process output when
            it was piped (i.e. ``dumpio`` was not set).
        """
        url = self.url + '/json/version'
        while self.proc.poll() is None:
            time.sleep(0.1)
            try:
                with urlopen(url) as f:
                    data = json.loads(f.read().decode())
                break
            except URLError:
                # Endpoint not up yet; try again.
                continue
        else:
            # ``stdout`` is None when ``dumpio`` is set, because the output
            # was not piped.  Decode leniently so an odd byte cannot mask
            # the real error.
            stdout = self.proc.stdout
            if stdout is not None:
                output = stdout.read().decode(errors='replace')
            else:
                output = ''
            raise BrowserError(
                'Browser closed unexpectedly:\n{}'.format(output)
            )
        return data['webSocketDebuggerUrl']

    def waitForChromeToClose(self) -> None:
        """Terminate chrome.

        Does nothing when the browser is already marked closed or the
        process has already exited.  Errors raised while terminating are
        ignored.
        """
        # Check the flag first: before ``launch`` it is True and
        # ``self.proc`` does not exist yet.
        if self.chromeClosed or self.proc.poll() is not None:
            return
        self.chromeClosed = True
        try:
            self.proc.terminate()
            self.proc.wait()
        except Exception:
            # browser process may be already closed
            pass

    async def killChrome(self) -> None:
        """Terminate chromium process.

        Sends ``Browser.close`` over the connection when it is connected
        (errors are logged through ``debugError`` and otherwise ignored).
        If a temporary user data directory exists, the process is then
        terminated and the directory removed.
        """
        logger.info('terminate chrome process...')
        if self.connection and self.connection._connected:
            try:
                await self.connection.send('Browser.close')
                await self.connection.dispose()
            except Exception as e:
                # ignore errors on browser termination process
                debugError(logger, e)
        if self._tmp_user_data_dir and os.path.exists(self._tmp_user_data_dir):
            # Force kill chrome only when using temporary userDataDir
            self.waitForChromeToClose()
            self._cleanup_tmp_user_data_dir()


async def launch(options: dict = None, **kwargs: Any) -> Browser:
    """Start chrome process and return :class:`~pyppeteer.browser.Browser`.

    This function is a shortcut to :meth:`Launcher(options, **kwargs).launch`.

    Available options are:

    * ``ignoreHTTPSErrors`` (bool): Whether to ignore HTTPS errors. Defaults to
      ``False``.
    * ``headless`` (bool): Whether to run browser in headless mode. Defaults to
      ``True`` unless ``appMode`` or ``devtools`` options is ``True``.
    * ``executablePath`` (str): Path to a Chromium or Chrome executable to run
      instead of default bundled Chromium.
    * ``slowMo`` (int|float): Slow down pyppeteer operations by the specified
      amount of milliseconds.
    * ``args`` (List[str]): Additional arguments (flags) to pass to the browser
      process.
    * ``ignoreDefaultArgs`` (bool): Do not use pyppeteer's default args. This
      is dangerous option; use with care.
    * ``handleSIGINT`` (bool): Close the browser process on Ctrl+C. Defaults to
      ``True``.
    * ``handleSIGTERM`` (bool): Close the browser process on SIGTERM. Defaults
      to ``True``.
    * ``handleSIGHUP`` (bool): Close the browser process on SIGHUP. Defaults to
      ``True``.
    * ``dumpio`` (bool): Whether to pipe the browser process stdout and stderr
      into ``process.stdout`` and ``process.stderr``. Defaults to ``False``.
    * ``userDataDir`` (str): Path to a user data directory.
    * ``env`` (dict): Specify environment variables that will be visible to the
      browser. Defaults to same as python process.
    * ``devtools`` (bool): Whether to auto-open a DevTools panel for each tab.
      If this option is ``True``, the ``headless`` option will be set
      ``False``.
    * ``logLevel`` (int|str): Log level to print logs. Defaults to same as the
      root logger.
    * ``autoClose`` (bool): Automatically close browser process when script
      completed. Defaults to ``True``.
    * ``loop`` (asyncio.AbstractEventLoop): Event loop (**experimental**).
    * ``appMode`` (bool): Deprecated.

    .. note::
        Pyppeteer can also be used to control the Chrome browser, but it works
        best with the version of Chromium it is bundled with. There is no
        guarantee it will work with any other version. Use ``executablePath``
        option with extreme caution.
    """
    return await Launcher(options, **kwargs).launch()


async def connect(options: dict = None, **kwargs: Any) -> Browser:
    """Connect to the existing chrome.

    ``browserWSEndpoint`` option is necessary to connect to the chrome. The
    format is ``ws://${host}:${port}/devtools/browser/<id>``. This value can
    get by :attr:`~pyppeteer.browser.Browser.wsEndpoint`.

    Available options are:

    * ``browserWSEndpoint`` (str): A browser websocket endpoint to connect to.
      (**required**)
    * ``ignoreHTTPSErrors`` (bool): Whether to ignore HTTPS errors. Defaults to
      ``False``.
    * ``slowMo`` (int|float): Slow down pyppeteer's by the specified amount of
      milliseconds.
    * ``logLevel`` (int|str): Log level to print logs. Defaults to same as the
      root logger.
    * ``loop`` (asyncio.AbstractEventLoop): Event loop (**experimental**).

    :raises BrowserError: If the ``browserWSEndpoint`` option is missing or
        empty.
    """
    options = merge_dict(options, kwargs)
    logLevel = options.get('logLevel')
    if logLevel:
        logging.getLogger('pyppeteer').setLevel(logLevel)

    browserWSEndpoint = options.get('browserWSEndpoint')
    if not browserWSEndpoint:
        raise BrowserError('Need `browserWSEndpoint` option.')
    connectionDelay = options.get('slowMo', 0)

    # Only ask asyncio for a loop when the caller did not supply one.
    loop = options.get('loop')
    if loop is None:
        loop = asyncio.get_event_loop()
    connection = Connection(browserWSEndpoint, loop, connectionDelay)

    browserContextIds = (await connection.send('Target.getBrowserContexts')
                         ).get('browserContextIds', [])
    ignoreHTTPSErrors = bool(options.get('ignoreHTTPSErrors', False))
    # No process handle is passed (``None``); closing the returned browser
    # sends ``Browser.close`` over the connection instead.
    return await Browser.create(
        connection, browserContextIds, ignoreHTTPSErrors, True, None,
        lambda: connection.send('Browser.close'))


def executablePath() -> str:
    """Get executable path of default chrome.

    :return: ``chromium_executable()`` converted to ``str``.
    """
    return str(chromium_executable())


def defaultArgs() -> List[str]:
    """Get list of default chrome args.

    :return: A new list holding ``DEFAULT_ARGS`` followed by
        ``AUTOMATION_ARGS``.
    """
    return DEFAULT_ARGS + AUTOMATION_ARGS