Skip to content

Commit

Permalink
Merge branch 'develop' into Add-txyz-ai-card
Browse files Browse the repository at this point in the history
  • Loading branch information
cbf66 authored Sep 19, 2024
2 parents 618b766 + a46202d commit 425a66e
Show file tree
Hide file tree
Showing 149 changed files with 8,165 additions and 46,568 deletions.
26 changes: 8 additions & 18 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
# UIs for browse.

FROM python:3.11.8-bookworm
RUN apt-get update && apt-get -y upgrade

ARG git_commit
# Do not update+upgrade. The base image is kept up to date, and running an
# upgrade here also destroys the ability to cache image layers.

ARG git_commit

ENV PYTHONFAULTHANDLER=1 \
PYTHONUNBUFFERED=1 \
Expand All @@ -16,14 +17,11 @@ ENV PYTHONFAULTHANDLER=1 \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
POETRY_VERSION=1.3.2 \
TRACE=1 \
LC_ALL=en_US.utf8 \
LANG=en_US.utf8 \
APP_HOME=/app
LANG=en_US.utf8

WORKDIR /app


RUN apt-get -y install default-libmysqlclient-dev

ENV VIRTUAL_ENV=/opt/venv
Expand All @@ -33,38 +31,30 @@ RUN pip install -U pip "poetry==$POETRY_VERSION"

COPY poetry.lock pyproject.toml ./
RUN poetry config virtualenvs.create false && \
poetry install --no-interaction --no-ansi

RUN pip install "gunicorn==20.1.0"
poetry install --no-interaction --no-ansi \
--without dev

ADD app.py /app/

ENV PATH "/app:${PATH}"

ADD browse /app/browse
ADD tests /app/tests
ADD wsgi.py /app/

RUN echo $git_commit > /git-commit.txt

EXPOSE 8080
ENV LC_ALL en_US.utf8
ENV LANG en_US.utf8
ENV LOGLEVEL 40
ENV FLASK_DEBUG 1
ENV FLASK_APP /opt/arxiv/app.py

RUN useradd e-prints
RUN chown e-prints:e-prints /app/tests/data/
RUN chmod 775 /app/tests/data/
USER e-prints

# Why is this command in an env var and not just run in CMD? So it can be used
# to start a docker instance during an integration test. See
# cicd/cloudbuild-master-pr.yaml for how it is used

ENV GUNICORN gunicorn --bind :8080 \
--workers 1 --threads 8 --timeout 0 \
--workers 4 --threads 8 --timeout 0 \
"browse.factory:create_web_app()"

CMD exec $GUNICORN
CMD exec $GUNICORN
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,21 @@ You can run the browse app directly.
make venv
````

(the make rarely works)

or

```bash
python --version
# 3.10.x
# 3.11.x
python -m venv ./venv
source ./venv/bin/activate
pip install poetry==1.3.2
poetry install
python main.py
```
Note -- make sure you have the Python development headers installed before doing the above steps, or `poetry install` will fail while trying to build the MySQL library dependency. E.g.: `sudo apt-get install python3.11-dev`

Then go to http://127.0.0.1:8080/abs/0906.5132

This will monitor for any changes to the Python code and restart the server.
Expand All @@ -37,8 +41,7 @@ First, you'd need to create the '.env' file somewhere. Using tests/.env is sugge
export GOOGLE_APPLICATION_CREDENTIALS=<Your SA credential>
export BROWSE_SQLALCHEMY_DATABASE_URI="mysql://browse:<BROWSE_PASSWORD>@127.0.0.1:1234/arXiv"
export DOCUMENT_ABSTRACT_SERVICE=browse.services.documents.db_docs
export DOCUMENT_LATEST_VERSIONS_PATH=gs://arxiv-production-data/ftp
export DOCUMENT_ORIGINAL_VERSIONS_PATH=gs://arxiv-production-data/orig
export ABS_PATH_ROOT=gs://arxiv-production-data
export DOCUMENT_CACHE_PATH=gs://arxiv-production-data/ps_cache
export DOCUMENT_LISTING_PATH=gs://arxiv-production-data/ftp
export DISSEMINATION_STORAGE_PREFIX=gs://arxiv-production-data
Expand Down Expand Up @@ -155,7 +158,7 @@ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
pytest tests
```
### Settinp up pytest in PyCharm
### Setting up pytest in PyCharm
![docs/development/pycharm-run-setting.png](docs/development/pycharm-pytest.png)
Expand Down
6 changes: 3 additions & 3 deletions browse/commands/check_paper_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import click

from arxiv.identifier import Identifier
from arxiv.db import session
from arxiv.db import Session
from arxiv.db.models import Metadata
from browse.services.dissemination import get_article_store, ArticleStore
from arxiv.formats import formats_from_source_flag
Expand All @@ -19,7 +19,7 @@
@click.argument("yymm")
def check_paper_formats(yymm: str) -> None:
"""Checks formats for yymm."""
query = (session.query(Metadata.paper_id, Metadata.version,
query = (Session.query(Metadata.paper_id, Metadata.version,
Metadata.source_format,
Metadata.source_flags, Metadata.source_size)
.filter(or_(Metadata.paper_id.like(f"%/{yymm}%"),
Expand All @@ -44,7 +44,7 @@ def check_paper_formats(yymm: str) -> None:
result["src_file_problem"] = ""
result["source_flag_only_formats"] = formats_from_source_flag(source_flags)

fileobj, fmt, docmeta, version = src
fileobj, docmeta, version = src
if isinstance(fileobj, FileObj):
result["sizes_match"] = source_size == fileobj.size
result["fs_size"] = fileobj.size
Expand Down
7 changes: 3 additions & 4 deletions browse/commands/invalidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from google.cloud import compute_v1
from sqlalchemy.orm import scoped_session

from arxiv.db import session
from arxiv.db import Session
from arxiv.db.models import NextMail

bp = Blueprint("invalidate", __name__)
Expand Down Expand Up @@ -39,14 +39,13 @@ def invalidate_mailings(project: str, cdn: str, mailings: List[str], dry_run: bo
raise ValueError("mailings values must be like '230130'")

paths: List[str] = []
session: scoped_session = session
for mailing in mailings:
if v:
print(f"About to query for {mailing}")
papers = (session.query(NextMail.paper_id, NextMail.version)
papers = (Session.query(NextMail.paper_id, NextMail.version)
.filter(NextMail.mail_id == int(mailing)))

nn = 0;
nn = 0
for paper_id, version in papers.all():
paths.append(f"/pdf/{paper_id}.pdf")
paths.append(f"/pdf/{paper_id}v{version}.pdf")
Expand Down
36 changes: 12 additions & 24 deletions browse/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class Settings(arxiv_base.Settings):
LATEXML_BASE_URL: str = ''
"""Base GS bucket URL to find the HTML."""

LATEXML_BUCKET: str = 'latexml_arxiv_id_converted'
LATEXML_BUCKET: str = './test/data'

SQLALCHEMY_TRACK_MODIFICATIONS: bool = False
SQLALCHEMY_ECHO: bool = False
Expand Down Expand Up @@ -99,20 +99,14 @@ class Settings(arxiv_base.Settings):
Accepted values are:
- `browse.services.documents.fs_docs`: DocMetadata using .abs files. Used in
production since 2019. If set DOCUMENT_LATEST_VERSIONS_PATH,
DOCUMENT_ORIGINAL_VERSIONS_PATH and DOCUMENT_CACHE_PATH need to be set.
production since 2019. If set ABS_PATH_ROOT needs to be set.
- `browse.services.documents.db_docs`: DocMetadata using the database.
"""

DOCUMENT_LATEST_VERSIONS_PATH: str = "tests/data/abs_files/ftp"
ABS_PATH_ROOT: str = "tests/data/abs_files/"
"""Paths to .abs and source files.
This can start with gs:// to use Google Storage."""
DOCUMENT_ORIGINAL_VERSIONS_PATH: str = "tests/data/abs_files/orig"
"""Paths to .abs and source files.
This can start with gs:// to use Google Storage.
"""
This can start with gs:// to use Google Storage."""
DOCUMENT_CACHE_PATH: str = "tests/data/cache"
"""Path to cache directory"""

Expand All @@ -126,13 +120,16 @@ class Settings(arxiv_base.Settings):
`./testing/data/` for testing data. Must end with a /
"""

GENPDF_API_URL: str = "https://genpdf-api.arxiv.org"
"""URL of the genpdf API"""
GENPDF_API_URL: str = ""
"""URL of the genpdf API. https://genpdf-api.arxiv.org"""

GENPDF_SERVICE_URL: str = ""
"""URL of the genpdf service URL. This is the original service URL on the cloud run."""

GENPDF_API_TIMEOUT: int = 590
"""Time ouf for the genpdf API access"""

GENPDF_API_STORAGE_PREFIX: str = "./tests/data/"
GENPDF_API_STORAGE_PREFIX: str = "./tests/data/abs_files"
"""Where genpdf stores the PDFs. It is likely the local file system does not work here but
it is plausible to match the gs bucket with local file system, esp. for testing.
For production, it would be:
Expand Down Expand Up @@ -352,19 +349,10 @@ def check(self) -> None:
"Using sqlite in CLASSIC_DB_URI in production environment"
)

if (self.DOCUMENT_ORIGINAL_VERSIONS_PATH.startswith("gs://")
and self.DOCUMENT_LATEST_VERSIONS_PATH.startswith("gs://")):
if self.ABS_PATH_ROOT.startswith("gs://"):
self.FS_TZ = "UTC"
log.warning("Switching FS_TZ to UTC since DOCUMENT_LATEST_VERSIONS_PATH "
"and DOCUMENT_ORIGINAL_VERSIONS_PATH are Google Storage")
log.warning("Switching FS_TZ to UTC since ABS_PATH_ROOT is Google Storage")
if os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', ''):
log.warning("GOOGLE_APPLICATION_CREDENTIALS is set")
else:
log.warning("GOOGLE_APPLICATION_CREDENTIALS is not set")

if ("fs_docs" in str(type(self.DOCUMENT_ABSTRACT_SERVICE)) and
"fs_listing" in str(type(self.DOCUMENT_LISTING_PATH)) and
self.DOCUMENT_LATEST_VERSIONS_PATH != self.DOCUMENT_LISTING_PATH):
log.warning(f"Unexpected: using FS listings and abs service but FS don't match. "
"latest abs at {self.DOCUMENT_LATEST_VERSIONS_PATH} "
f"but listings at {self.DOCUMENT_LISTING_PATH}")
3 changes: 1 addition & 2 deletions browse/controllers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
Each controller corresponds to a distinct browse feature with its own
request handling logic.
"""
from datetime import timezone, datetime, timedelta
from typing import Any, Dict, Optional, Tuple
from typing import Any, Dict, Optional, Tuple, List
from zoneinfo import ZoneInfo

from http import HTTPStatus as status
Expand Down
27 changes: 12 additions & 15 deletions browse/controllers/abs_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,12 @@
from urllib.parse import urljoin

from http import HTTPStatus as status

from arxiv.base import logging
from dateutil import parser
from dateutil.tz import tzutc
from flask import request, url_for
from werkzeug.exceptions import InternalServerError

from browse.controllers import check_supplied_identifier

from arxiv.base import logging
from arxiv.taxonomy.definitions import ARCHIVES, CATEGORIES
from arxiv.taxonomy.category import Category
from arxiv.identifier import (
Expand All @@ -32,6 +29,8 @@
AbsNotFoundException,
AbsVersionNotFoundException,
)
from arxiv.integration.fastly.headers import add_surrogate_key

from browse.exceptions import AbsNotFound
from browse.services.database import (
count_trackback_pings,
Expand All @@ -43,7 +42,7 @@
)
from browse.services.documents import get_doc_service
from browse.services.dissemination import get_article_store

from browse.controllers import check_supplied_identifier
from browse.formatting.external_refs_cits import (
DBLP_BASE_URL,
DBLP_BIBTEX_PATH,
Expand Down Expand Up @@ -97,7 +96,7 @@ def get_abs_page(arxiv_id: str) -> Response:

arxiv_id = _check_legacy_id_params(arxiv_id)
arxiv_identifier = Identifier(arxiv_id=arxiv_id)

response_headers=add_surrogate_key(response_headers,[f"abs-{arxiv_identifier.id}", f"paper-id-{arxiv_identifier.id}"])
redirect = check_supplied_identifier(arxiv_identifier, "browse.abstract")
if redirect:
return redirect
Expand Down Expand Up @@ -144,6 +143,9 @@ def get_abs_page(arxiv_id: str) -> Response:
response_data["higher_version_withdrawn_submitter"] = _get_submitter(abs_meta.arxiv_identifier,
ver.version)

response_data["encrypted"] = abs_meta.get_requested_version().source_flag.source_encrypted


_non_critical_abs_data(abs_meta, arxiv_identifier, response_data)

except AbsNotFoundException as ex:
Expand Down Expand Up @@ -305,15 +307,10 @@ def _prevnext_links(
):
context = request.args["context"]
elif primary_category:
pc = primary_category.get_canonical()
if not arxiv_identifier.is_old_id: # new style IDs
context = pc.id
else: # Old style id
if pc.id in ARCHIVES:
context = pc.id
else:
if arxiv_identifier.archive in ARCHIVES:
context = arxiv_identifier.archive
context = primary_category.canonical_id
elif arxiv_identifier.is_old_id:
if arxiv_identifier.archive in ARCHIVES: #context from old style id
context=ARCHIVES[arxiv_identifier.archive].canonical_id

response_data["browse_context"] = context
response_data["browse_context_previous_url"] = url_for(
Expand Down
13 changes: 8 additions & 5 deletions browse/controllers/archive_page/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Archive landing page."""

import datetime
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List, Tuple, Optional
from http import HTTPStatus as status

from arxiv.taxonomy.definitions import (
Expand All @@ -11,17 +11,20 @@
CATEGORIES
)
from arxiv.taxonomy.category import Category, Archive
from arxiv.integration.fastly.headers import add_surrogate_key

from browse.controllers import biz_tz
from browse.controllers.archive_page.by_month_form import ByMonthForm
from browse.controllers.years_operating import stats_by_year, years_operating
from browse.controllers.response_headers import abs_expires_header


def get_archive(archive_id: str) -> Tuple[Dict[str, Any], int, Dict[str, Any]]:
def get_archive(archive_id: Optional[str]) -> Tuple[Dict[str, Any], int, Dict[str, Any]]:
"""Gets archive page."""
data: Dict[str, Any] = {}
response_headers: Dict[str, Any] = {}
response_headers["Surrogate-Control"]="max-age=86400" #one day
response_headers=add_surrogate_key(response_headers,["archive"])

if not archive_id or archive_id == "list":
return archive_index("list", status_in=status.OK)
Expand All @@ -35,8 +38,6 @@ def get_archive(archive_id: str) -> Tuple[Dict[str, Any], int, Dict[str, Any]]:
return archive_index(archive_id,
status_in=status.NOT_FOUND)

_write_expires_header(response_headers)

if archive.is_active==False: #subsumed archives
subsuming_category=archive.get_canonical()
if not isinstance(subsuming_category, Category):
Expand Down Expand Up @@ -81,7 +82,9 @@ def archive_index(bad_archive_id: str, status_in: int) -> Tuple[Dict[str, Any],
data["defunct"] = defunct

data["template"] = "archive/archive_list_all.html"
return data, status_in, {}
headers: Dict[str,str]={}
headers=add_surrogate_key(headers,["archive"])
return data, status_in, headers


def category_list(archive: Archive) -> List[Category]:
Expand Down
2 changes: 1 addition & 1 deletion browse/controllers/cookies.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def get_cookies_page(is_debug: bool) -> Any:
'cookies_config': selected_options_from_request(copy.deepcopy(cookies_config)),
'debug': is_debug,
'controlled_cookies': [cc['name'] for cc in cookies_config],
'headers': (request.headers)
}
response_headers = {'Expires': '0',
'Pragma': 'no-cache'}
Expand All @@ -94,7 +95,6 @@ def selected_options_from_request(configs: List[Dict[str, Any]]) -> List[Dict[st
matching_opt[2] = 1
return configs


def cookies_to_set(req: flask.Request) -> List[Dict[str, object]]:
"""Get cookies from the form and return them as a list of tuples."""
cts = []
Expand Down
Loading

0 comments on commit 425a66e

Please sign in to comment.