Merge branch 'develop' into Add-txyz-ai-card

arXiv · Sep 19, 2024 · 425a66e · 425a66e
2 parents 618b766 + a46202d
commit 425a66e
Show file tree

Hide file tree

Showing 149 changed files with 8,165 additions and 46,568 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -4,10 +4,11 @@
 # UIs for browse.
 
 FROM python:3.11.8-bookworm
-RUN apt-get update && apt-get -y upgrade
 
-ARG git_commit
+# Do not update+upgrade: the base image is kept up to date, and running an
+# upgrade here also destroys the ability to cache.
 
+ARG git_commit
 
 ENV PYTHONFAULTHANDLER=1 \
     PYTHONUNBUFFERED=1 \
@@ -16,14 +17,11 @@ ENV PYTHONFAULTHANDLER=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=on \
     PIP_DEFAULT_TIMEOUT=100 \
     POETRY_VERSION=1.3.2 \
-    TRACE=1 \
     LC_ALL=en_US.utf8 \
-    LANG=en_US.utf8 \
-    APP_HOME=/app
+    LANG=en_US.utf8
 
 WORKDIR /app
 
-
 RUN apt-get -y install default-libmysqlclient-dev
 
 ENV VIRTUAL_ENV=/opt/venv
@@ -33,38 +31,30 @@ RUN pip install -U pip "poetry==$POETRY_VERSION"
 
 COPY poetry.lock pyproject.toml ./
 RUN poetry config virtualenvs.create false && \
-    poetry install --no-interaction --no-ansi
-
-RUN pip install "gunicorn==20.1.0"
+    poetry install --no-interaction --no-ansi \
+    --without dev
 
 ADD app.py /app/
 
 ENV PATH "/app:${PATH}"
 
 ADD browse /app/browse
-ADD tests /app/tests
 ADD wsgi.py /app/
 
 RUN echo $git_commit > /git-commit.txt
 
 EXPOSE 8080
-ENV LC_ALL en_US.utf8
-ENV LANG en_US.utf8
 ENV LOGLEVEL 40
-ENV FLASK_DEBUG 1
-ENV FLASK_APP /opt/arxiv/app.py
 
 RUN useradd e-prints
-RUN chown e-prints:e-prints /app/tests/data/
-RUN chmod 775 /app/tests/data/
 USER e-prints
 
 # Why is this command in an env var and not just run in CMD?  So it can be used
 # to start a docker instance during an integration test. See
 # cicd/cloudbuild-master-pr.yaml for how it is used
 
 ENV GUNICORN gunicorn --bind :8080 \
-    --workers 1 --threads 8 --timeout 0 \
+    --workers 4 --threads 8 --timeout 0 \
      "browse.factory:create_web_app()"
 
-CMD exec $GUNICORN
+CMD exec $GUNICORN
diff --git a/README.md b/README.md
@@ -8,17 +8,21 @@ You can run the browse app directly.
 make venv
 ````
 
+(Note: `make venv` rarely works.)
+
 or 
 
 ```bash
 python --version
-# 3.10.x
+# 3.11.x
 python -m venv ./venv
 source ./venv/bin/activate
 pip install poetry==1.3.2
 poetry install
 python main.py
 ```
+Note -- make sure you have the Python dev package installed before doing the above steps, or the `poetry install` will fail trying to build the MySQL library dependency. E.g.: `sudo apt-get install python3.11-dev`
+
 Then go to http://127.0.0.1:8080/abs/0906.5132
 
 This will monitor for any changes to the Python code and restart the server.
@@ -37,8 +41,7 @@ First, you'd need to create the '.env' file somewhere. Using tests/.env is sugge
     export GOOGLE_APPLICATION_CREDENTIALS=<Your SA credential>
     export BROWSE_SQLALCHEMY_DATABASE_URI="mysql://browse:<BROWSE_PASSWORD>@127.0.0.1:1234/arXiv"
     export DOCUMENT_ABSTRACT_SERVICE=browse.services.documents.db_docs
-    export DOCUMENT_LATEST_VERSIONS_PATH=gs://arxiv-production-data/ftp
-    export DOCUMENT_ORIGINAL_VERSIONS_PATH=gs://arxiv-production-data/orig
+    export ABS_PATH_ROOT=gs://arxiv-production-data
     export DOCUMENT_CACHE_PATH=gs://arxiv-production-data/ps_cache
     export DOCUMENT_LISTING_PATH=gs://arxiv-production-data/ftp
     export DISSEMINATION_STORAGE_PREFIX=gs://arxiv-production-data
@@ -155,7 +158,7 @@ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
 pytest tests
 ```
 
-### Settinp up pytest in PyCharm
+### Setting up pytest in PyCharm
 
 ![docs/development/pycharm-run-setting.png](docs/development/pycharm-pytest.png)
 

diff --git a/browse/commands/check_paper_formats.py b/browse/commands/check_paper_formats.py
@@ -8,7 +8,7 @@
 import click
 
 from arxiv.identifier import Identifier
-from arxiv.db import session
+from arxiv.db import Session
 from arxiv.db.models import Metadata
 from browse.services.dissemination import get_article_store, ArticleStore
 from arxiv.formats import formats_from_source_flag
@@ -19,7 +19,7 @@
 @click.argument("yymm")
 def check_paper_formats(yymm: str) -> None:
     """Checks formats for yymm."""
-    query = (session.query(Metadata.paper_id, Metadata.version,
+    query = (Session.query(Metadata.paper_id, Metadata.version,
                             Metadata.source_format,
                             Metadata.source_flags, Metadata.source_size)
                 .filter(or_(Metadata.paper_id.like(f"%/{yymm}%"),
@@ -44,7 +44,7 @@ def check_paper_formats(yymm: str) -> None:
             result["src_file_problem"] = ""
             result["source_flag_only_formats"] = formats_from_source_flag(source_flags)
 
-            fileobj, fmt, docmeta, version = src
+            fileobj, docmeta, version = src
             if isinstance(fileobj, FileObj):
                 result["sizes_match"] = source_size == fileobj.size
                 result["fs_size"] = fileobj.size

diff --git a/browse/commands/invalidate.py b/browse/commands/invalidate.py
@@ -9,7 +9,7 @@
 from google.cloud import compute_v1
 from sqlalchemy.orm import scoped_session
 
-from arxiv.db import session
+from arxiv.db import Session
 from arxiv.db.models import NextMail
 
 bp = Blueprint("invalidate", __name__)
@@ -39,14 +39,13 @@ def invalidate_mailings(project: str, cdn: str, mailings: List[str], dry_run: bo
         raise ValueError("mailings values must be like '230130'")
 
     paths: List[str] = []
-    session: scoped_session = session
     for mailing in mailings:
         if v:
             print(f"About to query for {mailing}")
-        papers = (session.query(NextMail.paper_id, NextMail.version)
+        papers = (Session.query(NextMail.paper_id, NextMail.version)
                   .filter(NextMail.mail_id == int(mailing)))
 
-        nn = 0;
+        nn = 0
         for paper_id, version in papers.all():
             paths.append(f"/pdf/{paper_id}.pdf")
             paths.append(f"/pdf/{paper_id}v{version}.pdf")

diff --git a/browse/config.py b/browse/config.py
@@ -38,7 +38,7 @@ class Settings(arxiv_base.Settings):
     LATEXML_BASE_URL: str = ''
     """Base GS bucket URL to find the HTML."""
 
-    LATEXML_BUCKET: str = 'latexml_arxiv_id_converted'
+    LATEXML_BUCKET: str = './test/data'
 
     SQLALCHEMY_TRACK_MODIFICATIONS: bool = False
     SQLALCHEMY_ECHO: bool = False
@@ -99,20 +99,14 @@ class Settings(arxiv_base.Settings):
 
     Accepted values are:
     - `browse.services.documents.fs_docs`: DocMetadata using .abs files. Used in
-       production since 2019. If set DOCUMENT_LATEST_VERSIONS_PATH,
-       DOCUMENT_ORIGINAL_VERSIONS_PATH and DOCUMENT_CACHE_PATH need to be set.
+       production since 2019. If set, ABS_PATH_ROOT needs to be set.
     - `browse.services.documents.db_docs`: DocMetadata using the database.
     """
 
-    DOCUMENT_LATEST_VERSIONS_PATH: str = "tests/data/abs_files/ftp"
+    ABS_PATH_ROOT: str = "tests/data/abs_files/"
     """Paths to .abs and source files.
 
-        This can start with gs:// to use Google Storage."""
-    DOCUMENT_ORIGINAL_VERSIONS_PATH: str = "tests/data/abs_files/orig"
-    """Paths to .abs and source files.
-
-        This can start with gs:// to use Google Storage.
-    """
+       This can start with gs:// to use Google Storage."""
     DOCUMENT_CACHE_PATH: str = "tests/data/cache"
     """Path to cache directory"""
 
@@ -126,13 +120,16 @@ class Settings(arxiv_base.Settings):
     `./testing/data/` for testing data. Must end with a /
     """
 
-    GENPDF_API_URL: str = "https://genpdf-api.arxiv.org"
-    """URL of the genpdf API"""
+    GENPDF_API_URL: str = ""
+    """URL of the genpdf API. https://genpdf-api.arxiv.org"""
+
+    GENPDF_SERVICE_URL: str = ""
+    """URL of the genpdf service. This is the original service URL on Cloud Run."""
 
     GENPDF_API_TIMEOUT: int = 590
     """Timeout for the genpdf API access"""
 
-    GENPDF_API_STORAGE_PREFIX: str = "./tests/data/"
+    GENPDF_API_STORAGE_PREFIX: str = "./tests/data/abs_files"
     """Where genpdf stores the PDFs. It is likely the local file system does not work here but
     it is plausible to match the gs bucket with local file system, esp. for testing.
     For production, it would be:
@@ -352,19 +349,10 @@ def check(self) -> None:
                 "Using sqlite in CLASSIC_DB_URI in production environment"
             )
 
-        if (self.DOCUMENT_ORIGINAL_VERSIONS_PATH.startswith("gs://")
-                and self.DOCUMENT_LATEST_VERSIONS_PATH.startswith("gs://")):
+        if self.ABS_PATH_ROOT.startswith("gs://"):
             self.FS_TZ = "UTC"
-            log.warning("Switching FS_TZ to UTC since DOCUMENT_LATEST_VERSIONS_PATH "
-                        "and DOCUMENT_ORIGINAL_VERSIONS_PATH are Google Storage")
+            log.warning("Switching FS_TZ to UTC since ABS_PATH_ROOT is Google Storage")
             if os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', ''):
                 log.warning("GOOGLE_APPLICATION_CREDENTIALS is set")
             else:
                 log.warning("GOOGLE_APPLICATION_CREDENTIALS is not set")
-
-        if ("fs_docs" in str(type(self.DOCUMENT_ABSTRACT_SERVICE)) and
-                "fs_listing" in str(type(self.DOCUMENT_LISTING_PATH)) and
-                self.DOCUMENT_LATEST_VERSIONS_PATH != self.DOCUMENT_LISTING_PATH):
-            log.warning(f"Unexpected: using FS listings and abs service but FS don't match. "
-                        "latest abs at {self.DOCUMENT_LATEST_VERSIONS_PATH} "
-                        f"but listings at {self.DOCUMENT_LISTING_PATH}")
diff --git a/browse/controllers/__init__.py b/browse/controllers/__init__.py
@@ -3,8 +3,7 @@
 Each controller corresponds to a distinct browse feature with its own
 request handling logic.
 """
-from datetime import timezone, datetime, timedelta
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, List
 from zoneinfo import ZoneInfo
 
 from http import HTTPStatus as status

diff --git a/browse/controllers/abs_page.py b/browse/controllers/abs_page.py
@@ -9,15 +9,12 @@
 from urllib.parse import urljoin
 
 from http import HTTPStatus as status
-
-from arxiv.base import logging
 from dateutil import parser
 from dateutil.tz import tzutc
 from flask import request, url_for
 from werkzeug.exceptions import InternalServerError
 
-from browse.controllers import check_supplied_identifier
-
+from arxiv.base import logging
 from arxiv.taxonomy.definitions import ARCHIVES, CATEGORIES
 from arxiv.taxonomy.category import Category
 from arxiv.identifier import (
@@ -32,6 +29,8 @@
     AbsNotFoundException,
     AbsVersionNotFoundException,
 )
+from arxiv.integration.fastly.headers import add_surrogate_key
+
 from browse.exceptions import AbsNotFound
 from browse.services.database import (
     count_trackback_pings,
@@ -43,7 +42,7 @@
 )
 from browse.services.documents import get_doc_service
 from browse.services.dissemination import get_article_store
-
+from browse.controllers import check_supplied_identifier
 from browse.formatting.external_refs_cits import (
     DBLP_BASE_URL,
     DBLP_BIBTEX_PATH,
@@ -97,7 +96,7 @@ def get_abs_page(arxiv_id: str) -> Response:
 
         arxiv_id = _check_legacy_id_params(arxiv_id)
         arxiv_identifier = Identifier(arxiv_id=arxiv_id)
-
+        response_headers=add_surrogate_key(response_headers,[f"abs-{arxiv_identifier.id}", f"paper-id-{arxiv_identifier.id}"])
         redirect = check_supplied_identifier(arxiv_identifier, "browse.abstract")
         if redirect:
             return redirect
@@ -144,6 +143,9 @@ def get_abs_page(arxiv_id: str) -> Response:
                     response_data["higher_version_withdrawn_submitter"] = _get_submitter(abs_meta.arxiv_identifier,
                                                                                          ver.version)
 
+        response_data["encrypted"] = abs_meta.get_requested_version().source_flag.source_encrypted
+
+
         _non_critical_abs_data(abs_meta, arxiv_identifier, response_data)
 
     except AbsNotFoundException as ex:
@@ -305,15 +307,10 @@ def _prevnext_links(
     ):
         context = request.args["context"]
     elif primary_category:
-        pc = primary_category.get_canonical()
-        if not arxiv_identifier.is_old_id:  # new style IDs
-            context = pc.id
-        else:  # Old style id
-            if pc.id in ARCHIVES:
-                context = pc.id
-            else:
-                if arxiv_identifier.archive in ARCHIVES:
-                    context = arxiv_identifier.archive
+        context = primary_category.canonical_id
+    elif arxiv_identifier.is_old_id: 
+        if arxiv_identifier.archive in ARCHIVES: #context from old style id
+                    context=ARCHIVES[arxiv_identifier.archive].canonical_id
 
     response_data["browse_context"] = context
     response_data["browse_context_previous_url"] = url_for(

diff --git a/browse/controllers/archive_page/__init__.py b/browse/controllers/archive_page/__init__.py
@@ -1,7 +1,7 @@
 """Archive landing page."""
 
 import datetime
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, Optional
 from http import HTTPStatus as status
 
 from arxiv.taxonomy.definitions import (
@@ -11,17 +11,20 @@
     CATEGORIES
 )
 from arxiv.taxonomy.category import Category, Archive
+from arxiv.integration.fastly.headers import add_surrogate_key
 
 from browse.controllers import biz_tz
 from browse.controllers.archive_page.by_month_form import ByMonthForm
 from browse.controllers.years_operating import stats_by_year, years_operating
 from browse.controllers.response_headers import abs_expires_header
 
 
-def get_archive(archive_id: str) -> Tuple[Dict[str, Any], int, Dict[str, Any]]:
+def get_archive(archive_id: Optional[str]) -> Tuple[Dict[str, Any], int, Dict[str, Any]]:
     """Gets archive page."""
     data: Dict[str, Any] = {}
     response_headers: Dict[str, Any] = {}
+    response_headers["Surrogate-Control"]="max-age=86400" #one day
+    response_headers=add_surrogate_key(response_headers,["archive"])
 
     if not archive_id or archive_id == "list":
         return archive_index("list", status_in=status.OK)
@@ -35,8 +38,6 @@ def get_archive(archive_id: str) -> Tuple[Dict[str, Any], int, Dict[str, Any]]:
         return archive_index(archive_id,
                                  status_in=status.NOT_FOUND)
 
-    _write_expires_header(response_headers)
-
     if archive.is_active==False: #subsumed archives
         subsuming_category=archive.get_canonical()
         if not isinstance(subsuming_category, Category):
@@ -81,7 +82,9 @@ def archive_index(bad_archive_id: str, status_in: int) -> Tuple[Dict[str, Any],
     data["defunct"] = defunct
 
     data["template"] = "archive/archive_list_all.html"
-    return data, status_in, {}
+    headers: Dict[str,str]={}
+    headers=add_surrogate_key(headers,["archive"])
+    return data, status_in, headers
 
 
 def category_list(archive: Archive) -> List[Category]:

diff --git a/browse/controllers/cookies.py b/browse/controllers/cookies.py
@@ -77,6 +77,7 @@ def get_cookies_page(is_debug: bool) -> Any:
         'cookies_config': selected_options_from_request(copy.deepcopy(cookies_config)),
         'debug': is_debug,
         'controlled_cookies': [cc['name'] for cc in cookies_config],
+        'headers': (request.headers)
     }
     response_headers = {'Expires': '0',
                         'Pragma': 'no-cache'}
@@ -94,7 +95,6 @@ def selected_options_from_request(configs: List[Dict[str, Any]]) -> List[Dict[st
             matching_opt[2] = 1
     return configs
 
-
 def cookies_to_set(req: flask.Request) -> List[Dict[str, object]]:
     """Get cookies from the form and return them as a list of tuples."""
     cts = []