Merge pull request #724 from arXiv/ARXIVCE-2412-rebuild-catchup

Arxivce 2412 rebuild catchup
arXiv · Sep 19, 2024 · 54fee52 · 54fee52
2 parents 04a4f4f + 8f88d3d
commit 54fee52
Show file tree

Hide file tree

Showing 13 changed files with 1,437 additions and 82 deletions.
diff --git a/browse/controllers/archive_page/__init__.py b/browse/controllers/archive_page/__init__.py
@@ -1,6 +1,6 @@
 """Archive landing page."""
 
-import datetime
+from datetime import datetime
 from typing import Any, Dict, List, Tuple, Optional
 from http import HTTPStatus as status
 
@@ -48,15 +48,14 @@ def get_archive(archive_id: Optional[str]) -> Tuple[Dict[str, Any], int, Dict[st
         archive = subsuming_category.get_archive()
 
     years = years_operating(archive)
-    data["years"] = years
+    data["years"] = [datetime.now().year, datetime.now().year-1] #only last 90 days allowed anyways
     data["months"] = MONTHS
     data["days"] = DAYS
     data["archive"] = archive
     data["list_form"] = ByMonthForm(archive, years)
     data["stats_by_year"] = stats_by_year(archive, years)
     data["category_list"] = category_list(archive)
-
-    data["catchup_to"] = datetime.date.today() - datetime.timedelta(days=7)
+    data["current_month"] = datetime.now().strftime('%m')
     data["template"] = "archive/single_archive.html"
     return data, status.OK, response_headers
 

diff --git a/browse/controllers/catchup_page.py b/browse/controllers/catchup_page.py
@@ -0,0 +1,256 @@
+"""handles requests to the catchup page.
+Allows users to access something equivalent to the /new page for up to 90 days back
+"""
+import re
+from typing import Tuple, Union, Dict, Any, List
+from datetime import date, datetime, timedelta
+
+from http import HTTPStatus
+from flask import request, redirect, url_for
+from werkzeug.exceptions import BadRequest
+
+from arxiv.document.metadata import DocMetadata
+from arxiv.integration.fastly.headers import add_surrogate_key
+from arxiv.taxonomy.category import Group, Archive, Category
+from arxiv.taxonomy.definitions import CATEGORIES, ARCHIVES, GROUPS, ARCHIVES_ACTIVE
+
+from browse.controllers.archive_page.by_month_form import MONTHS
+from browse.controllers.list_page import latexml_links_for_articles, dl_for_articles, authors_for_articles, sub_sections_for_types, Response
+from browse.services.database.catchup import get_catchup_data, CATCHUP_LIMIT, get_next_announce_day
+
+def get_catchup_page(subject_str:str, date:str)-> Response:
+    """Build the catchup listing page for one subject and one day.
+
+    Args:
+        subject_str: subject id taken from the URL path; validated by
+            _process_catchup_params.
+        date: day to catch up on, taken from the URL path, as YYYY-MM-DD.
+
+    Returns:
+        A (response_data, status_code, headers) tuple. For a non-canonical
+        subject the data is empty, the status is 301 and the headers carry a
+        Location pointing at the canonical subject's catchup URL.
+
+    Raises:
+        BadRequest: propagated from _process_catchup_params for invalid input.
+    """
+    subject, start_day, include_abs, page = _process_catchup_params(subject_str, date)
+
+    # Redirect non-canonical subjects. The redirect is returned as a plain
+    # (data, code, headers) tuple, the same shape get_catchup_form uses for
+    # its redirect, so that callers can always unpack three values.
+    if subject.id != subject.canonical_id:
+        canonical_url = url_for('.catchup',
+                                subject=subject.canonical_id,
+                                date=start_day.strftime('%Y-%m-%d'),
+                                page=page,
+                                abs=include_abs)
+        return {}, HTTPStatus.MOVED_PERMANENTLY, {'Location': canonical_url}
+
+    headers: Dict[str,str] = {}
+    headers = add_surrogate_key(headers, ["catchup", f"list-{start_day.year:04d}-{start_day.month:02d}-{subject.id}"])
+
+    # get data
+    listing = get_catchup_data(subject, start_day, include_abs, page)
+    next_announce_day = get_next_announce_day(start_day)
+
+    # format data
+    headers.update({'Surrogate-Control': f'max-age={listing.expires}'})
+    count = listing.new_count + listing.cross_count + listing.rep_count
+    skip = (page-1)*CATCHUP_LIMIT  # number of items that sit on earlier pages
+
+    response_data: Dict[str, Any] = {}
+    response_data['announced'] = listing.announced
+    response_data.update(catchup_index_for_types(listing.new_count, listing.cross_count, listing.rep_count, subject, start_day, include_abs, page))
+    response_data.update(sub_sections_for_types(listing, skip, CATCHUP_LIMIT))
+
+    # number the items continuously across pages, the first item overall being 1
+    for list_index, item in enumerate(listing.listings, start=skip + 1):
+        setattr(item, 'list_index', list_index)
+
+    response_data['listings'] = listing.listings
+    response_data['author_links'] = authors_for_articles(listing.listings)
+    response_data['downloads'] = dl_for_articles(listing.listings)
+    response_data['latexml'] = latexml_links_for_articles(listing.listings)
+
+    response_data.update({
+        'subject': subject,
+        'date': start_day,
+        'next_day': next_announce_day,
+        'page': page,
+        'include_abs': include_abs,
+        'count': count,
+        'list_type': "new" if include_abs else "catchup", #how the list macro checks to display abstract
+        'paging': catchup_paging(subject, start_day, include_abs, page, count)
+    })
+
+    def author_query(article: DocMetadata, query: str)->str:
+        """Return the author-search URL for query, scoped to the article's archive."""
+        # Start from the unscoped value so that a failed lookup below still
+        # leaves archive_id bound when the URL is built.
+        archive_id = ''
+        try:
+            if article.primary_archive:
+                archive_id = article.primary_archive.id
+            elif article.primary_category:
+                archive_id = article.primary_category.in_archive
+        except (AttributeError, KeyError):
+            archive_id = ''
+        return str(url_for('search_archive',
+                           searchtype='author',
+                           archive=archive_id,
+                           query=query))
+
+    response_data['url_for_author_search'] = author_query
+
+    return response_data, 200, headers
+
+def get_catchup_form() -> Response:
+    """Serve the catchup form, or redirect a submitted form to its catchup page.
+
+    Reads 'subject', 'date' and 'include_abs' from the query string. When both
+    subject and date are present the answer is a 301 whose Location header is
+    the matching catchup URL; otherwise the data needed to render the form is
+    returned.
+
+    Returns:
+        A (response_data, status_code, headers) tuple.
+    """
+    headers: Dict[str,str] = {}
+    headers = add_surrogate_key(headers, ["catchup"])
+
+    #check for form/parameter requests
+    subject = request.args.get('subject')
+    date_str = request.args.get('date')  # not named 'date', to avoid hiding the imported date class
+    include_abs = request.args.get('include_abs')
+    if subject and date_str:
+        if include_abs:
+            new_address = url_for('.catchup', subject=subject, date=date_str, abs=include_abs)
+        else:
+            new_address = url_for('.catchup', subject=subject, date=date_str)
+        headers.update({'Location': new_address})
+        headers.update({'Surrogate-Control': 'max-age=2600000'}) #one month, url construction should never change
+        headers = add_surrogate_key(headers, ["catchup-redirect"])
+        return {}, 301, headers
+
+    #otherwise create catchup form
+    now = datetime.now()  # read the clock once so the years and the month always agree
+    response_data: Dict[str, Any] = {
+        'years': [now.year, now.year-1], #only last 90 days allowed anyways
+        'months': MONTHS[1:],
+        'current_month': now.strftime('%m'),
+        'days': [str(day).zfill(2) for day in range(1, 32)],
+        'groups': GROUPS,
+    }
+
+    headers = add_surrogate_key(headers, ["catchup-form"])
+    headers.update({'Surrogate-Control': 'max-age=604800'}) #one week, form never changes except for autoselecting the current month
+    return response_data, 200, headers
+
+
+def _process_catchup_params(subject_str:str, date_str:str)->Tuple[Union[Group, Archive, Category], date, bool, int]:
+    """Validate the catchup request parameters and convert them to usable values.
+
+    The subject and date come from the URL path (passed in as arguments); the
+    optional 'abs' and 'page' values are read from the request query string.
+
+    Args:
+        subject_str: an archive id, a category id, or 'grp_physics'.
+        date_str: day to catch up on, formatted YYYY-MM-DD.
+
+    Returns:
+        subject: as a Group, Archive, or Category. Still needs to be checked for canonicalness
+        start_day: date (date to catchup on)
+        abs: bool (include abstracts or not)
+        page: int (which page of results, default is 1)
+
+    Raises:
+        BadRequest: for an unknown query parameter, an unknown subject, a
+            malformed or out-of-range date, or a bad 'abs' or 'page' value.
+    """
+
+    #check for valid arguments
+    ALLOWED_PARAMS = {"abs", "page"}
+    unexpected_params = request.args.keys() - ALLOWED_PARAMS
+    if unexpected_params:
+        raise BadRequest("Unexpected parameters. Only accepted parameters are: 'page', and 'abs'")
+
+    #subject validation
+    subject: Union[Group, Archive, Category]
+    if subject_str == "grp_physics":
+        subject = GROUPS["grp_physics"]
+    elif subject_str in ARCHIVES:
+        subject = ARCHIVES[subject_str]
+    elif subject_str in CATEGORIES:
+        subject = CATEGORIES[subject_str]
+    else:
+        raise BadRequest("Invalid subject. Subject must be an archive, category or 'grp_physics'")
+
+    #date validation
+    #enforce four digit years and two digit days and months; only ASCII digits
+    #are accepted so that each day has exactly one valid spelling
+    if not re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", date_str):
+        raise BadRequest("Invalid date format. Use format: YYYY-MM-DD")
+    try:
+        start_day = datetime.strptime(date_str, "%Y-%m-%d").date()
+    except ValueError:
+        #right shape but not a real calendar day, e.g. 2024-02-31
+        raise BadRequest("Invalid date format. Use format: YYYY-MM-DD")
+    #only allow dates within the last 90 days (91 just in case time zone differences)
+    today = datetime.now().date()
+    earliest_allowed = today - timedelta(days=91)
+    if start_day < earliest_allowed:
+        #TODO link to earliest allowed date
+        raise BadRequest(f"Invalid date: {start_day}. Catchup only allowed for past 90 days")
+    elif start_day > today:
+        raise BadRequest(f"Invalid date: {start_day}. Can't request date later than today")
+
+    #include abstract or not
+    abs_str = request.args.get("abs", "False")
+    if abs_str == "True":
+        include_abs = True
+    elif abs_str == "False":
+        include_abs = False
+    else:
+        raise BadRequest("Invalid abs value. Use ?abs=True to include abstracts or ?abs=False to not")
+
+    #select page number, page defaults to 1
+    page_str = request.args.get("page", "1")
+    #str.isdigit() is not a safe guard here: it is true for characters such as
+    #superscript digits that int() refuses, so match plain ASCII digits instead
+    if not re.fullmatch(r"[0-9]+", page_str):
+        raise BadRequest("Invalid page value. Page value should be a positive integer like ?page=3")
+    page = int(page_str)
+    if page < 1:
+        raise BadRequest("Invalid page value. Page value should be a positive integer like ?page=3")
+
+    return subject, start_day, include_abs, page
+
+def catchup_paging(subject: Union[Group, Archive, Category], day:date, include_abs:bool, page: int, count:int)-> List[Tuple[str,str]]:
+    """Create the page links shown when a catchup listing spans several pages.
+
+    Args:
+        subject: subject of the listing; its id is used in the links.
+        day: day being caught up on.
+        include_abs: whether the links should keep abstracts switched on.
+        page: the page currently displayed (pages start at 1).
+        count: total number of items across all pages.
+
+    Returns:
+        A list of (label, target) tuples in display order. target is a URL, or
+        'no-link' for the current page and for '...' gap markers. The list is
+        empty when everything fits on one page.
+    """
+    if CATCHUP_LIMIT >= count: #only one page
+        return []
+
+    #ceiling division: a count that is an exact multiple of CATCHUP_LIMIT
+    #must not produce an extra, empty last page
+    total_pages = (count + CATCHUP_LIMIT - 1)//CATCHUP_LIMIT
+    #url_base already carries the abs query parameter, so the page parameter is appended with '&'
+    url_base = url_for('.catchup', subject=subject.id, date=day.strftime('%Y-%m-%d'), abs=include_abs)
+    page_links = []
+
+    if total_pages < 10: #realistically there should be at most 2-3 pages per day
+        for i in range(1, total_pages+1):
+            if i == page:
+                page_links.append((str(i), 'no-link'))
+            else:
+                page_links.append((str(i), url_base+f'&page={i}'))
+
+    else: #shouldn't happen but it's handled: first page, current page, last page
+        if page != 1:
+            page_links.append(('1', url_base+'&page=1'))
+        if page > 2:
+            page_links.append(('...', 'no-link'))
+        page_links.append((str(page), 'no-link'))
+        if page < total_pages-1:
+            page_links.append(('...', 'no-link'))
+        if page != total_pages:
+            page_links.append((str(total_pages), url_base+f'&page={total_pages}'))
+
+    return page_links
+
+def catchup_index_for_types(new_count:int, cross_count:int, rep_count:int,  subject: Union[Group, Archive, Category], day:date, include_abs:bool, page: int) ->Dict[str, Any]:
+    """Build the jump index for the new / cross-list / replacement sections.
+
+    Each non-empty section contributes a (label, url, position) tuple: url is
+    '' when the section starts on the page being displayed and otherwise
+    links to the page where it starts; position is the place of the section's
+    first item on that page. Page numbers and positions both start at 1.
+    """
+    #(label, number of items, overall position of the section's first item)
+    sections = (
+        ('New submissions', new_count, 1),
+        ('Cross-lists', cross_count, new_count + 1),
+        ('Replacements', rep_count, new_count + cross_count + 1),
+    )
+    day_str = day.strftime('%Y-%m-%d')
+
+    entries = []
+    for label, section_count, first_item in sections:
+        if not section_count > 0:
+            continue
+        first_page = (first_item - 1)//CATCHUP_LIMIT + 1 #item 2000 is on page 1, 2001 is on page 2
+        position = first_item - (first_page - 1)*CATCHUP_LIMIT
+        if page == first_page:
+            target = ''
+        else:
+            target = url_for('.catchup', subject=subject.id, date=day_str, abs=include_abs, page=first_page)
+        entries.append((label, target, position))
+
+    return {'index_for_types': entries}
diff --git a/browse/controllers/list_page/__init__.py b/browse/controllers/list_page/__init__.py
@@ -549,7 +549,7 @@ def sub_sections_for_types(
         continued=skipn > 0,
         last=skipn >= new_count - shown,
         visible=len(news)>0,
-        heading=f'New submissions for {date} ' 
+        heading=f'New submissions ' 
     )
 
     sec_cross=ListingSection(
@@ -559,7 +559,7 @@ def sub_sections_for_types(
         continued=skipn + 1 > cross_start,
         last=skipn >= rep_start - shown,
         visible=len(crosses)>0,
-        heading=f'Cross submissions for {date} '
+        heading=f'Cross submissions '
     )
 
     sec_rep=ListingSection(
@@ -569,7 +569,7 @@ def sub_sections_for_types(
         continued=skipn + 1 > rep_start,
         last=last_shown >= new_count + cross_count + rep_count,
         visible=len(reps)>0,
-        heading=f'Replacement submissions for {date} '
+        heading=f'Replacement submissions '
     )
 
     secs=[sec_new, sec_cross, sec_rep]
@@ -582,7 +582,7 @@ def sub_sections_for_types(
                 showing = showing + 'last '
         if not sec.last and not sec.continued:
             showing = showing + 'first '
-        sec.heading += f'({showing}{len(sec.items)} of {sec.total} entries )'
+        sec.heading += f'({showing}{len(sec.items)} of {sec.total} entries)'
 
     return {'sub_sections_for_types': secs}
 

diff --git a/browse/routes/ui.py b/browse/routes/ui.py
@@ -30,7 +30,8 @@
     list_page,
     prevnext,
     stats_page,
-    tb_page
+    tb_page,
+    catchup_page
 )
 from browse.controllers.openurl_cookie import make_openurl_cookie, get_openurl_page
 from browse.controllers.cookies import get_cookies_page, cookies_to_set
@@ -129,6 +130,21 @@ def category_taxonomy() -> Any:
         None,
     )
 
+@blueprint.route("catchup", methods=["GET"], endpoint="catchup_form")
+def catchup_form() -> Response:
+    """Render the catchup form, or pass the controller's non-OK answer straight through."""
+    data, code, headers = catchup_page.get_catchup_form()
+    if code != status.OK:
+        # e.g. the 301 the controller produces for a submitted form
+        return data, code, headers  # type: ignore
+    return render_template("catchup_form.html", **data), code, headers  # type: ignore
+
+@blueprint.route("catchup/<subject>/<date>", methods=["GET"])
+def catchup(subject:str, date:str) -> Response:
+    """Render the catchup listing for a subject and day, or pass a non-OK answer through."""
+    data, code, headers = catchup_page.get_catchup_page(subject, date)
+    headers = add_surrogate_key(headers, ["catchup"])
+    if code != status.OK:
+        return data, code, headers  # type: ignore
+    return render_template("catchup.html", **data), code, headers  # type: ignore
+
 @blueprint.route("institutional_banner", methods=["GET"])
 def institutional_banner() -> Any:
     try:

diff --git a/browse/routes/unimplemented.py b/browse/routes/unimplemented.py
@@ -18,9 +18,6 @@
 
 # these commenetd out blueprints are ones that the arxiv-browse or
 # arxiv-search systems should sooner or later should probably implement.
-#
-# @blueprint.route("/pdf/", defaults={"path":""})
-# @blueprint.route("/pdf/<path:path>")
 # @blueprint.route("/docmeta/", defaults={"path":""})
 # @blueprint.route("/docmeta/<path:path>")
 # @blueprint.route("/docmeta_bulk/", defaults={"path":""})
@@ -31,18 +28,8 @@
 # @blueprint.route("/tar/<path:path>")
 # @blueprint.route("/abstar/", defaults={"path":""})
 # @blueprint.route("/abstar/<path:path>")
-# @blueprint.route("/e-print/", defaults={"path":""})
-# @blueprint.route("/e-print/<path:path>")
-# @blueprint.route("/src/", defaults={"path":""})
-# @blueprint.route("/src/<path:path>")
-# @blueprint.route("/list/", defaults={"path":""})
-# @blueprint.route("/list/<path:path>")
 # @blueprint.route("/view/", defaults={"path":""})
 # @blueprint.route("/view/<path:path>")
-# @blueprint.route("/catchup/", defaults={"path":""})
-# @blueprint.route("/catchup/<path:path>")
-# @blueprint.route("/year/", defaults={"path":""})
-# @blueprint.route("/year/<path:path>")
 # @blueprint.route("/cits/", defaults={"path":""})
 # @blueprint.route("/cits/<path:path>")
 # @blueprint.route("/refs/", defaults={"path":""})
@@ -51,18 +38,12 @@
 # @blueprint.route("/ps/<path:path>")
 # @blueprint.route("/psfigs/", defaults={"path":""})
 # @blueprint.route("/psfigs/<path:path>")
-# @blueprint.route("/format/", defaults={"path":""})
-# @blueprint.route("/format/<path:path>")
 # @blueprint.route("/dvi/", defaults={"path":""})
 # @blueprint.route("/dvi/<path:path>")
-# @blueprint.route("/pdf/", defaults={"path":""})
-# @blueprint.route("/pdf/<path:path>")
 # @blueprint.route("/openurl-cookie/", defaults={"path":""})
 # @blueprint.route("/openurl-cookie/<path:path>")
 # @blueprint.route("/openurl-resolver/", defaults={"path":""})
 # @blueprint.route("/openurl-resolver/<path:path>")
-# @blueprint.route("/html/", defaults={"path":""})
-# @blueprint.route("/html/<path:path>")
 # @blueprint.route("/ftp/<path:path>")