From adca188afa724595a0efbcc0ed68902085ffa888 Mon Sep 17 00:00:00 2001 From: Neill Cox Date: Sun, 24 Dec 2023 18:07:32 +1100 Subject: [PATCH] First working version --- .gitignore | 5 + README.md | 3 - credentials.example | 19 + download.sh.example | 17 + pyproject.toml | 33 ++ src/explore.py | 7 + src/patreon_dl/__init__.py | 0 src/patreon_dl/main.py | 879 +++++++++++++++++++++++++++++++++++++ 8 files changed, 960 insertions(+), 3 deletions(-) create mode 100644 credentials.example create mode 100644 download.sh.example create mode 100644 pyproject.toml create mode 100644 src/explore.py create mode 100644 src/patreon_dl/__init__.py create mode 100644 src/patreon_dl/main.py diff --git a/.gitignore b/.gitignore index 5d381cc..c836752 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,8 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +download.sh +credentials.sh +*sql3 +dl.log diff --git a/README.md b/README.md index a144897..e69de29 100644 --- a/README.md +++ b/README.md @@ -1,3 +0,0 @@ -# patreon-dl - -A patreon downloader \ No newline at end of file diff --git a/credentials.example b/credentials.example new file mode 100644 index 0000000..e1f7e1f --- /dev/null +++ b/credentials.example @@ -0,0 +1,19 @@ +# +# The value for these environment variables can be found by logging into your +# patreon account and using developer tools to find the cookie being sent. +# +# Source a copy of this file with actual credentials and then: +# patreon-dl --log-level=INFO -d $HOME/Downloads/PDL +# +# Adjusting the log level and destination as desired. 
"""Scratch script: open each downloaded archive and drop into a debugger.

Walks the hard-coded download directory, opens every file found as a zip
archive and stops in ``bpdb`` so the archive can be inspected interactively.
"""
import os
import zipfile

ARCHIVE_ROOT = "/home/ncox/OMV_Shared/download/PDL/tomcartos/"

for root, _dirs, files in os.walk(ARCHIVE_ROOT):
    for file in files:
        # os.walk yields sub-directory roots without a trailing separator,
        # so the path has to be joined rather than concatenated.
        archive_path = os.path.join(root, file)

        # The context manager closes the archive once inspection is done.
        with zipfile.ZipFile(archive_path) as zfile:
            import bpdb

            bpdb.set_trace()
def add_to_visited(conn, link, creator, error=None):
    """Record *link* as visited and remove it from the unvisited queue.

    Args:
        conn: Open sqlite3 connection holding the ``visited_links`` and
            ``unvisited_links`` tables.
        link: URL of the page that was visited.
        creator: Creator id stored alongside the link.
        error: Optional description of why the visit failed.

    Both statements are issued before a single commit.
    """
    visited_row = {
        "href": link,
        "now": datetime.datetime.now().timestamp(),
        "error": error,
        "creator": creator,
    }
    insert_sql = (
        "insert into visited_links (href, visited_at, error, creator) "
        "values(:href, :now, :error, :creator)"
    )
    delete_sql = "delete from unvisited_links where href = :link"

    conn.execute(insert_sql, visited_row)
    conn.execute(delete_sql, {"link": link})
    conn.commit()
def count_files_downloaded(conn, creator):
    """Return how many of the creator's files have a download timestamp.

    Args:
        conn: Open sqlite3 connection holding the ``files`` table.
        creator: Creator id to count files for.
    """
    query = "select count(*) from files where creator = :creator and downloaded_at is not null"
    (downloaded_total,) = conn.execute(query, {"creator": creator}).fetchone()
    return downloaded_total
def download_file(link, cookies, download_destination, force_download):
    """Download one file with the requests library.

    Args:
        link: URL of the file to fetch.
        cookies: Cookie dict sent with the request.
        download_destination: Directory the file is written into.
        force_download: When true, download even if a file with the same
            name and size is already present locally.

    Returns:
        The local file name on success, or when the download was skipped
        because a same-sized copy already exists. For a 404 or 403 response
        the error is recorded with ``file_error`` and ``link`` is returned.

    Raises:
        requests.HTTPError: For any other unsuccessful status code.
        ValueError: If the server-supplied file name cannot be used.
    """
    logging.debug("downloading %s", link)

    with requests.get(
        link, stream=True, cookies=cookies, timeout=120
    ) as result:
        if not result.ok:
            if result.status_code == 404:
                logging.warning("File %s not found", link)
                file_error(result.status_code, link)
                return link
            if result.status_code == 403:
                logging.warning(
                    "Permission denied when downloading %s", link
                )
                file_error(result.status_code, link)
                return link

            result.raise_for_status()

        # The name comes from a server-controlled header. Keep only its last
        # path component so it cannot point outside download_destination.
        header_name = get_filename_from_header(result.headers)
        local_filename = os.path.basename(header_name.replace("\\", "/"))
        if local_filename in ("", ".", ".."):
            raise ValueError(
                f"Unusable file name {header_name!r} for {link}"
            )

        try:
            size = int(result.headers["Content-Length"])
            body = None
        except KeyError:
            # No content-length, so the size is only known after reading the
            # whole body; it is then written in one go instead of streamed.
            body = result.content
            size = len(body)
            logging.debug(
                "No content-lentg set. Will download directly"
            )

        filename = os.path.join(download_destination, local_filename)

        if not force_download:
            try:
                stats = os.stat(filename)
            except FileNotFoundError:
                pass
            else:
                if stats.st_size == size:
                    logging.debug(
                        "A file with the same name (%s) and "
                        "size (%s) has already been downloaded. Skipping",
                        local_filename,
                        size,
                    )
                    return local_filename

        with open(filename, "wb") as f:
            if body is None:
                for chunk in result.iter_content(chunk_size=8192):
                    f.write(chunk)
            else:
                f.write(body)

        logging.info("file %s downloaded", local_filename)
        return local_filename
def get_filename_from_header(headers):
    """Extract the file name from a ``Content-Disposition`` header.

    Example header::

        attachment; filename="Czepeku Map Contest #1.zip";
        filename*=utf-8''Czepeku%20Map%20Contest%20%231.zip

    The plain ``filename`` parameter is preferred. If it is absent or empty,
    the percent-encoded ``filename*`` parameter is decoded and used instead.
    Quoted values may contain ``;`` and ``=``; unquoted values are returned
    as they are.

    Args:
        headers: Mapping of response headers supporting ``.get``.

    Returns:
        The file name as a string.

    Raises:
        ValueError: If the header is missing or carries no usable file name.
    """
    # Local import: only needed for the filename* fallback.
    from urllib.parse import unquote

    content_disposition = headers.get("Content-Disposition")
    if not content_disposition:
        raise ValueError("Could not find filename")

    extended = None
    for field in _split_header_fields(content_disposition):
        name, separator, value = field.partition("=")
        if not separator:
            continue
        name = name.strip().lower()
        value = value.strip()

        if name == "filename":
            filename = _unquote_header_value(value)
            if filename:
                return filename
        elif name == "filename*" and extended is None:
            # Extended form: charset'language'percent-encoded-value
            charset, _, rest = value.partition("'")
            _language, _, encoded = rest.partition("'")
            try:
                extended = unquote(
                    encoded, encoding=charset or "utf-8", errors="strict"
                )
            except (LookupError, UnicodeDecodeError):
                extended = None

    if extended:
        return extended

    raise ValueError("Could not find filename")


def _split_header_fields(value):
    """Split a header value on semicolons that are outside quoted strings."""
    fields = []
    current = []
    in_quotes = False
    escaped = False

    for char in value:
        if escaped:
            escaped = False
        elif char == "\\" and in_quotes:
            escaped = True
        elif char == '"':
            in_quotes = not in_quotes
        elif char == ";" and not in_quotes:
            fields.append("".join(current))
            current = []
            continue
        current.append(char)

    fields.append("".join(current))
    return fields


def _unquote_header_value(value):
    """Strip surrounding double quotes and undo ``\\"`` / ``\\\\`` escapes."""
    if len(value) < 2 or not value[0] == value[-1] == '"':
        return value

    result = []
    chars = iter(value[1:-1])
    for char in chars:
        if char == "\\":
            following = next(chars, "")
            # Only \" and \\ count as escapes; any other backslash is kept.
            if following not in ('"', "\\"):
                result.append(char)
            char = following
        result.append(char)
    return "".join(result)
+ """ + + delta = 0 + last_size = 0 + same_count = 0 + n = 0 + driver.get(url) + while 1: + text = driver.page_source + size = len(text) + + delta = size - last_size + last_size = size + n += 1 + if delta == 0: + same_count += 1 + else: + same_count = 0 + + if same_count >= 100: + return text + + # Change to be time based + if n > 1000: + raise RuntimeError("timeout") + + logging.debug( + "Waiting for page to stabilise delta: %d same_count: %d n: %d", + delta, + same_count, + n, + ) + + return text + + +def in_files(conn, href): + """Check to see of the specified file is already in the table.""" + return ( + conn.execute( + "select count(*) from files where href = :href", + {"href": href}, + ).fetchone()[0] + > 0 + ) + + +def in_unvisited(conn, link): + """ + Check to see if the specified link is in the list of unvisited links + already + """ + return ( + conn.execute( + "select count(*) from unvisited_links where href = :link", + {"link": link}, + ).fetchone()[0] + != 0 + ) + + +def initialise_database(db_path): + """Initialise the database tables if needed""" + conn = sqlite3.connect(db_path, autocommit=False) + + conn.execute( + """create table if not exists visited_links( + href char(400) not null primary key, + creator text not null, + visited_at datetime not null, + error text + ) + """ + ) + + conn.execute( + """create table if not exists unvisited_links( + href char(400) not null primary key, + creator char(100) not null , + added_at datetime not null) + """ + ) + + conn.execute( + """create table if not exists files( + href char(400) primary key not null, + creator char not null, + file_name text, + file_size integer, + added_at datetime not null, + downloaded_at datetime, + file_type text) + """ + ) + + return conn + + +def login_to_patreon(config, driver): + """Set the cookies so we are logged in. 
def visit_link(href, config):
    """Crawl one page: record its files, queue its links, mark it visited.

    Args:
        href: URL of the page to load.
        config: Run configuration; this function reads ``driver``,
            ``campaign_id``, ``creator``, ``conn`` and ``max_files``.

    The page is loaded through selenium and checked against the campaign
    being downloaded. File links are stored with ``add_file``; post and
    collection links not seen before are queued with ``add_to_unvisited``.
    Pending files are then downloaded and the page is recorded as visited.
    Pages that fail to load, or that belong to another campaign, are
    recorded as visited together with the error text.
    """
    # pylint:disable=too-many-locals
    # Local import: used only to resolve hrefs against the page URL.
    from urllib.parse import urljoin

    logging.info("Processing %s", href)
    driver = config["driver"]
    campaign_id = config["campaign_id"]
    creator = config["creator"]
    conn = config["conn"]

    if (
        config["max_files"]
        and count_files_to_download(conn, creator) > config["max_files"]
    ):
        # NOTE(review): returning here leaves href in unvisited_links, so a
        # caller that loops until that table is empty is handed the same
        # link again. Confirm how the cap is meant to end the crawl.
        return

    try:
        text = get_page(driver, href)
    except WebDriverException:
        error = (
            f"WebDriverException occurres while trying to load {href}"
        )
        logging.error(error)
        add_to_visited(conn, href, creator, error=error)
        return

    try:
        page_campaign_id, _ = get_campaign_details(driver)
    except JavascriptException:
        error = (
            "Javascript exception when trying to get page details - "
            "error page?"
        )
        logging.warning(error)
        add_to_visited(conn, href, creator, error=error)
        return

    if page_campaign_id != campaign_id:
        error = f"This url {href} is not a post belonging to the specified creator"
        logging.info(error)
        # Store the row under the creator being crawled, not the foreign
        # page's campaign id, so the creator column stays consistent.
        add_to_visited(conn, href, creator, error=error)
        return

    bs = BeautifulSoup(text, "html.parser")
    hrefs = [a.attrs["href"] for a in bs.find_all("a") if "href" in a.attrs]

    file_links = [
        target
        for target in hrefs
        if target.startswith("https://www.patreon.com/file")
    ]
    # Posts first, then collections, matching the order they are queued in.
    page_links = [target for target in hrefs if "/posts" in target]
    page_links += [target for target in hrefs if "/collection" in target]

    for file_link in file_links:
        logging.debug("adding file %s", file_link)
        add_file(conn, file_link, creator)

    logging.info(
        "Found %d files so far, %d downloaded",
        count_files(conn, creator),
        count_files_downloaded(conn, creator),
    )

    for target in page_links:
        # Resolve relative hrefs against the current page; absolute URLs
        # (including non-https ones) pass through unchanged.
        link = urljoin(href, target)

        if not link.startswith("https://www.patreon.com"):
            logging.info("%s isn't a patreon link 0 skipping", link)
            continue

        # Let's not follow links to other creators
        match = re.match("https://www.patreon.com/(.+)/posts[/?]", link)
        if match and match.group(1) != creator:
            logging.info(
                "This url (%s) looks like a link to a different creator "
                "so skipping",
                link,
            )
            continue

        if already_seen(conn, link):
            logging.debug("Already seen %s - skipping", link)
            continue

        add_to_unvisited(conn, link, creator)

    download_files(config)

    add_to_visited(conn, href, creator)
    logging.info(
        "Seen %d posts %d still to visit",
        seen_posts(conn, creator),
        count_posts_to_visit(conn, creator),
    )
@click.command()
@click.option(
    "--creator",
    required=True,
    help="Text id of creator.",
    envvar="PDL_CREATOR",
)
@click.option(
    "--device-id",
    default="491a6a67-1893-4474-9aad-a0dc82977cf3",
    envvar="PDL_DEVICE_ID",
)
@click.option(
    "--country-code", required=True, envvar="PDL_COUNTRY_CODE"
)
@click.option("--locale", required=True, envvar="PDL_LOCALE")
@click.option("--currency", required=True, envvar="PDL_CURRENCY")
@click.option("--session-id", required=True, envvar="PDL_SESSION_ID")
@click.option(
    "--analytics-session-id",
    required=True,
    envvar="PDL_ANALYTICS_SESSION_ID",
)
@click.option("--cf-bm", envvar="PDL_CF_BM", required=True)
@click.option("--log-level", default="WARNING")
@click.option("-d", "--download-destination", required=True)
@click.option("--dry-run", is_flag=True)
@click.option("--force", is_flag=True)
@click.option("--max-files", default=0, type=int)
@click.option("--file")
@click.option("--db-path", default="./pdl_downloader.sql3")
def main(
    creator,
    device_id,
    country_code,
    locale,
    currency,
    session_id,
    analytics_session_id,
    cf_bm,
    log_level,
    dry_run,
    download_destination,
    force,
    max_files,
    file,
    db_path,
):
    """Command line entry point.

    Validates the download directory, creates a per-creator sub-directory
    in it, builds the config dict and then either downloads the single URL
    given with ``--file`` or crawls the whole creator. ``--max-files`` is
    parsed by click as an integer; 0 disables the cap. ``--dry-run`` is
    stored in the config.

    Raises:
        click.BadParameter: If the download destination is not an existing,
            writable directory.
    """
    # pylint:disable=too-many-arguments,too-many-locals
    start_time = datetime.datetime.now()

    if not os.path.isdir(download_destination):
        raise click.BadParameter(
            "Download path does not exist",
            param_hint="--download-destination",
        )

    if not os.access(download_destination, os.W_OK):
        raise click.BadParameter(
            "Cannot write to download path",
            param_hint="--download-destination",
        )

    download_destination = os.path.join(download_destination, creator)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_destination, exist_ok=True)

    config = {
        "creator": creator,
        "device_id": device_id,
        "country_code": country_code,
        "locale": locale,
        "currency": currency,
        "session_id": session_id,
        "analytics_session_id": analytics_session_id,
        "cf_bm": cf_bm,
        "download_destination": download_destination,
        "dry_run": dry_run,
        "force": force,
        "max_files": max_files,
        "db_path": db_path,
    }

    logging.basicConfig(
        # Level names are upper case; accept e.g. --log-level=info as well.
        level=log_level.upper(),
        format="%(levelname)s:%(asctime)s:%(funcName)s:%(message)s",
    )

    logging.info("Download started at %s", start_time)
    if file:
        download_file(
            file, cookie_dict(config), download_destination, force
        )
    else:
        download_creator(config)

    end_time = datetime.datetime.now()
    elapsed = end_time - start_time

    logging.info(
        "Download finished at %s, elapsed time was %s",
        end_time,
        elapsed,
    )