diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md deleted file mode 100644 index cbc5611..0000000 --- a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md +++ /dev/null @@ -1,17 +0,0 @@ -### Motivation - - -### Change description - - -### Additional Notes - - -### Reviewer checklist - -* [ ] PR address a single concern. -* [ ] PR title and description are properly filled. -* [ ] Changes will be merged in `master`. -* [ ] Changes are covered by tests. -* [ ] Logging is meaningful in case of troubleshooting. -* [ ] History is clean, commit messages are meaningful (see `CONTRIBUTING.md`) and are well formatted. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index c660177..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: CI - -on: [push] - -jobs: - build: - - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.7] - - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install -e '.[test]' - - name: Checks for security vulnerabilities - run: | - safety check - - name: Lint - run: | - make lint - - name: Test with pytest - run: | - make test diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..6a47f02 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,52 @@ +name: Publish Python Package + +on: + release: + types: [created] + +permissions: + contents: read + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + pip install '.[test]' + - name: Run tests + run: | + python -m pytest + deploy: + runs-on: ubuntu-latest + needs: [test] + environment: release + permissions: + id-token: write + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + pip install setuptools wheel build + - name: Build + run: | + python -m build + - name: Publish + uses: pypa/gh-action-pypi-publish@release/v1 + diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index e160e16..0000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Release package - -on: - push: - tags: - - 'v[0-9].*' - -jobs: - release-package: - runs-on: ubuntu-latest - - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.7' - architecture: 'x64' - - name: Build package - run: | - python -m pip install --upgrade pip - pip install wheel setuptools - python setup.py bdist_wheel - - name: Upload packate to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.pypi_token }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..35a8e94 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,28 @@ +name: Test + +on: [push, pull_request] + +permissions: + contents: read + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + pip install '.[test]' + - name: Run tests + run: | + python -m pytest + diff --git a/Makefile b/Makefile index 3691f7d..2f84e74 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,9 @@ .PHONY: sources = ifq tests -line_length = 79 -black_options = --line-length=${line_length} ${sources} -isort_options = --line-length=${line_length} --py 39 --profile black ${sources} +line_length = 80 +black_options = --line-length=${line_length} ifq tests +isort_options = **/*.py -l ${line_length} lint: lint-black lint-isort lint-flake8 ## Lint the project on the host diff --git a/README.md b/README.md index 7392b22..3b832e0 100644 --- a/README.md +++ b/README.md @@ -1,101 +1,68 @@ -# IFQ +# ifq -Library to download www.ilfattoquotidiano.it issues in PDF. +[![PyPI](https://img.shields.io/pypi/v/ifq.svg)](https://pypi.org/project/ifq/) +[![Changelog](https://img.shields.io/github/v/release/zmoog/ifq?include_prereleases&label=changelog)](https://github.com/zmoog/ifq/releases) +[![Tests](https://github.com/zmoog/ifq/actions/workflows/test.yml/badge.svg)](https://github.com/zmoog/ifq/actions/workflows/test.yml) +[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/zmoog/ifq/blob/master/LICENSE) -```python -from datetime import date -from ifq import Scraper +CLI tool and Python library to download PDF issues of ilfattoquotidiano.it -username = '' # your ifq username -password = '' # your ifq password +## Installation -scraper = Scraper(username, password) +Install this tool using `pip`: -path_to_pdf_file = scraper.download_pdf(date.today()) +```bash +pip install ifq ``` -## Getting Started +## Usage -These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system. +Download the IFQ issue for Jan, 2nd 2025: -### Prerequisites +```sh +# Requires a valid subscription to the newspaper +export IFQ_USERNAME="[your username]" +export IFQ_PASSWORD="[your password]" -What things you need to install the software and how to install them +$ ifq issues download 2025-01-02 -``` -Give examples -``` - -### Installing - -A step by step series of examples that tell you how to get a development env running +Downloading issue for 2025-01-02 00:00:00 to /Users/zmoog/code/projects/zmoog/ifq +Downloaded issue to /Users/zmoog/code/projects/zmoog/ifq/2025-01-02.pdf -Say what the step will be - -``` -Give the example +$ file 2025-01-02.pdf +2025-01-02.pdf: PDF document, version 1.7 ``` -And repeat +For help, run: -``` -until finished +```bash +ifq --help ``` -End with an example of getting some data out of the system or using it for a little demo - -## Running the tests - -Explain how to run the automated tests for this system - -### Break down into end to end tests - -Explain what these tests test and why +You can also use: -``` -Give an example +```bash +python -m ifq --help ``` -### And coding style tests +## Development -Explain what these tests test and why +To contribute to this tool, first checkout the code. Then create a new virtual environment: +```bash +cd ifq +python -m venv venv +source venv/bin/activate ``` -Give an example -``` - -## Deployment -Add additional notes about how to deploy this on a live system +Now install the dependencies and test dependencies: -## Built With - -* [Dropwizard](http://www.dropwizard.io/1.0.2/docs/) - The web framework used -* [Maven](https://maven.apache.org/) - Dependency Management -* [ROME](https://rometools.github.io/rome/) - Used to generate RSS Feeds - -## Contributing - -Please read [CONTRIBUTING.md](https://gist.github.com/PurpleBooth/b24679402957c63ec426) for details on our code of conduct, and the process for submitting pull requests to us. - -## Versioning - -We use [SemVer](http://semver.org/) for versioning. For the versions available, see the [tags on this repository](https://github.com/your/project/tags). - -## Authors - -* **Billie Thompson** - *Initial work* - [PurpleBooth](https://github.com/PurpleBooth) - -See also the list of [contributors](https://github.com/your/project/contributors) who participated in this project. - -## License - -This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details - -## Acknowledgments +```bash +pip install -e '.[test]' +``` -* Hat tip to anyone whose code was used -* Inspiration -* etc +To run the tests: -https://gist.github.com/PurpleBooth/109311bb0361f32d87a2 \ No newline at end of file +```bash +python -m pytest +``` diff --git a/ifq/__init__.py b/ifq/__init__.py index 3f37929..75fd49b 100644 --- a/ifq/__init__.py +++ b/ifq/__init__.py @@ -1,12 +1,15 @@ import logging +import os import tempfile from datetime import date +from pathlib import Path import requests from lxml import html IFQ_LOGIN_URL = "https://shop.ilfattoquotidiano.it/login/" IFQ_ARCHIVE_URL = "https://shop.ilfattoquotidiano.it/archivio-edizioni/" +IFQ_MIN_CONTENT_LENGTH = 9000000 class Scraper: @@ -20,7 +23,9 @@ def __init__(self, username: str, password: str): self.username = username self.password = password - def download_pdf(self, pub_date: date) -> str: + def download_pdf( + self, pub_date: date, output_dir: Path = Path.cwd() + ) -> Path: """Download a IFQ issues from the website archive. Scrape and download the issue published at the give pub_date and @@ -67,7 +72,7 @@ def download_pdf(self, pub_date: date) -> str: ] if len(logged_in_cookies) < 1: self.logger.error("login failed") - raise LoginFailed("Cannot login") + raise LoginError("Cannot login") self.logger.info("getting archive page") # open the archive page and get the nonce @@ -86,12 +91,28 @@ def download_pdf(self, pub_date: date) -> str: IFQ_ARCHIVE_URL, data=edition_payload, stream=True ) + self.logger.debug(f"status code: {resp.status_code}") + self.logger.debug(f"headers: {resp.headers}") + self.logger.debug(f"content length: {len(resp.content)}") + if resp.status_code != 200: - self.logger.error(f"status code ${resp.status_code}") - raise IssueNotAvailable() + raise IssueNotAvailableError( + f"expected status code 200, got ${resp.status_code}" + ) + + if resp.headers["Content-Type"] != "application/pdf": + raise DownloadError( + f"expected 'application/pdf', got '{resp.headers['Content-Type']}'" + ) + + if len(resp.content) < IFQ_MIN_CONTENT_LENGTH: + raise DownloadError( + f"expected at least {IFQ_MIN_CONTENT_LENGTH} bytes, got {len(resp.content)}" + ) - self.logger.info("copying the PDF bytes into a temporary file") - # copy the PDF bytes into a temporary file + self.logger.debug("copying the PDF bytes into a temporary file") + + # create a temporary file to store the PDF bytes file = tempfile.NamedTemporaryFile(delete=False) with file as f: @@ -101,13 +122,24 @@ def download_pdf(self, pub_date: date) -> str: f.write(chunk) f.flush() - self.logger.info(f"PDF file available at ${file.name}") - return file.name + # rename the file to the output directory if specified + output_file = os.path.join( + output_dir, pub_date.strftime("%Y-%m-%d") + ".pdf" + ) + os.rename(file.name, output_file) + + self.logger.info(f"PDF file available at ${output_file}") + + return output_file + + +class IssueNotAvailableError(Exception): + pass -class IssueNotAvailable(Exception): +class LoginError(Exception): pass -class LoginFailed(Exception): +class DownloadError(Exception): pass diff --git a/ifq/__main__.py b/ifq/__main__.py new file mode 100644 index 0000000..98dcca0 --- /dev/null +++ b/ifq/__main__.py @@ -0,0 +1,4 @@ +from .cli import cli + +if __name__ == "__main__": + cli() diff --git a/ifq/cli.py b/ifq/cli.py new file mode 100644 index 0000000..09ed806 --- /dev/null +++ b/ifq/cli.py @@ -0,0 +1,61 @@ +import datetime +from pathlib import Path + +import click + +from ifq import Scraper + + +@click.group() +@click.version_option() +@click.option( + "--username", + required=True, + envvar="IFQ_USERNAME", +) +@click.option( + "--password", + required=True, + envvar="IFQ_PASSWORD", +) +@click.pass_context +def cli(ctx: click.Context, username: str, password: str): + "CLI tool and Python library to download PDF issues of ilfattoquotidiano.it" + ctx.ensure_object(dict) + ctx.obj = { + "username": username, + "password": password, + } + + +@cli.group(name="issues") +def issues(): + "Issues related commands" + pass + + +@issues.command(name="download") +@click.argument( + # https://click.palletsprojects.com/en/stable/arguments/ + "pub_date", + type=click.DateTime(formats=["%Y-%m-%d"]), +) +@click.option( + "-o", + "--output-dir", + type=click.Path(file_okay=False, writable=True, resolve_path=True), + help="Output directory", + default=Path.cwd(), +) +@click.pass_context +def download(ctx: click.Context, pub_date: datetime.date, output_dir: Path): + "Download issue as PDF file" + click.echo(f"Downloading issue for {pub_date} to {output_dir}") + + try: + scraper = Scraper(ctx.obj["username"], ctx.obj["password"]) + file = scraper.download_pdf(pub_date, output_dir) + except Exception as e: + raise click.ClickException(f"failed to download issue: {e}") + + click.echo(f"Downloaded issue to {file}") diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f1a6279 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,32 @@ +[project] +name = "ifq" +version = "0.4" +description = "CLI tool and Python library to download PDF issues of ilfattoquotidiano.it" +readme = "README.md" +authors = [{name = "Maurizio Branca"}] +license = {text = "Apache-2.0"} +requires-python = ">=3.8" +classifiers = [ + "License :: OSI Approved :: Apache Software License" +] +dependencies = [ + "click", + "requests", + "lxml", +] + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project.urls] +Homepage = "https://github.com/zmoog/ifq" +Changelog = "https://github.com/zmoog/ifq/releases" +Issues = "https://github.com/zmoog/ifq/issues" +CI = "https://github.com/zmoog/ifq/actions" + +[project.scripts] +ifq = "ifq.cli:cli" + +[project.optional-dependencies] +test = ["pytest", "black", "flake8", "isort"] diff --git a/setup.py b/setup.py deleted file mode 100644 index e9a9e02..0000000 --- a/setup.py +++ /dev/null @@ -1,48 +0,0 @@ -from setuptools import setup -import os - -VERSION = "0.3.0" - - -def get_long_description(): - with open( - os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md"), - encoding="utf8", - ) as fp: - return fp.read() - - -setup( - name="ifq", - description="Library to download www.ilfattoquotidiano.it issues in PDF", - long_description=get_long_description(), - long_description_content_type="text/markdown", - author="Maurizio Branca", - url="https://github.com/zmoog/ifq", - project_urls={ - "Issues": "https://github.com/zmoog/ifq/issues", - "CI": "https://github.com/zmoog/ifq/actions", - "Changelog": "https://github.com/zmoog/ifq/releases", - }, - license="Apache License, Version 2.0", - version=VERSION, - packages=["ifq"], - entry_points=""" - [console_scripts] - """, - install_requires=[ - "lxml >= 4.9.1", - "requests >= 2.28.1", - ], - extras_require={ - "test": [ - "black", - "flake8", - "isort", - "pytest", - "safety", - "wheel >= 0.38.3", - ], - }, - python_requires=">=3.7", -) diff --git a/tests/test_ifq.py b/tests/test_ifq.py new file mode 100644 index 0000000..6640704 --- /dev/null +++ b/tests/test_ifq.py @@ -0,0 +1,11 @@ +from click.testing import CliRunner + +from ifq.cli import cli + + +def test_version(): + runner = CliRunner() + with runner.isolated_filesystem(): + result = runner.invoke(cli, ["--version"]) + assert result.exit_code == 0 + assert result.output.startswith("cli, version ") diff --git a/tests/test_import.py b/tests/test_import.py deleted file mode 100644 index d3fef93..0000000 --- a/tests/test_import.py +++ /dev/null @@ -1,4 +0,0 @@ -def test_import(): - from ifq import Scraper - - _ = Scraper(username="test", password="")