Skip to content

Commit

Permalink
Merge pull request #724 from arXiv/ARXIVCE-2412-rebuild-catchup
Browse files Browse the repository at this point in the history
Arxivce 2412 rebuild catchup
  • Loading branch information
kyokukou authored Sep 19, 2024
2 parents 04a4f4f + 8f88d3d commit 54fee52
Show file tree
Hide file tree
Showing 13 changed files with 1,437 additions and 82 deletions.
7 changes: 3 additions & 4 deletions browse/controllers/archive_page/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Archive landing page."""

import datetime
from datetime import datetime
from typing import Any, Dict, List, Tuple, Optional
from http import HTTPStatus as status

Expand Down Expand Up @@ -48,15 +48,14 @@ def get_archive(archive_id: Optional[str]) -> Tuple[Dict[str, Any], int, Dict[st
archive = subsuming_category.get_archive()

years = years_operating(archive)
data["years"] = years
data["years"] = [datetime.now().year, datetime.now().year-1] #only last 90 days allowed anyways
data["months"] = MONTHS
data["days"] = DAYS
data["archive"] = archive
data["list_form"] = ByMonthForm(archive, years)
data["stats_by_year"] = stats_by_year(archive, years)
data["category_list"] = category_list(archive)

data["catchup_to"] = datetime.date.today() - datetime.timedelta(days=7)
data["current_month"] = datetime.now().strftime('%m')
data["template"] = "archive/single_archive.html"
return data, status.OK, response_headers

Expand Down
256 changes: 256 additions & 0 deletions browse/controllers/catchup_page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
"""handles requests to the catchup page.
Allows users to access something equivalent to the /new page for up to 90 days back
"""
import re
from typing import Tuple, Union, Dict, Any, List
from datetime import date, datetime, timedelta

from http import HTTPStatus
from flask import request, redirect, url_for
from werkzeug.exceptions import BadRequest

from arxiv.document.metadata import DocMetadata
from arxiv.integration.fastly.headers import add_surrogate_key
from arxiv.taxonomy.category import Group, Archive, Category
from arxiv.taxonomy.definitions import CATEGORIES, ARCHIVES, GROUPS, ARCHIVES_ACTIVE

from browse.controllers.archive_page.by_month_form import MONTHS
from browse.controllers.list_page import latexml_links_for_articles, dl_for_articles, authors_for_articles, sub_sections_for_types, Response
from browse.services.database.catchup import get_catchup_data, CATCHUP_LIMIT, get_next_announce_day

def get_catchup_page(subject_str:str, date:str)-> Response:
    """Build the catchup listing page for one subject and one announce day.

    ``subject_str`` and ``date`` are the raw URL path segments. They are
    validated by ``_process_catchup_params``, which also reads the optional
    ``abs`` and ``page`` query parameters from the current request and raises
    ``BadRequest`` on invalid input.

    Returns:
        A ``(response_data, status_code, headers)`` tuple. When the subject is
        not canonical, the tuple is an empty body, a 301 status and a
        ``Location`` header pointing at the canonical subject's catchup URL.
    """
    subject, start_day, include_abs, page = _process_catchup_params(subject_str, date)

    # Redirect non-canonical subjects. This is returned as a plain
    # (body, status, headers) tuple, the same shape get_catchup_form uses for
    # its redirect, because the route unpacks three values from this function;
    # returning a Response object from redirect() would break that unpacking.
    # The endpoint is written blueprint-relative ('.catchup') like every other
    # url_for call to this route in the module.
    if subject.id != subject.canonical_id:
        canonical_url = url_for('.catchup',
                                subject=subject.canonical_id,
                                date=start_day,
                                page=page,
                                abs=include_abs)
        return {}, HTTPStatus.MOVED_PERMANENTLY, {'Location': canonical_url}

    headers: Dict[str,str] = {}
    headers = add_surrogate_key(headers, ["catchup", f"list-{start_day.year:04d}-{start_day.month:02d}-{subject.id}"])

    # get data
    listing = get_catchup_data(subject, start_day, include_abs, page)
    next_announce_day = get_next_announce_day(start_day)

    # format data
    response_data: Dict[str, Any] = {}
    headers.update({'Surrogate-Control': f'max-age={listing.expires}'})
    count = listing.new_count + listing.cross_count + listing.rep_count
    response_data['announced'] = listing.announced
    skip = (page - 1) * CATCHUP_LIMIT  # number of items on the preceding pages
    response_data.update(catchup_index_for_types(listing.new_count, listing.cross_count, listing.rep_count, subject, start_day, include_abs, page))
    response_data.update(sub_sections_for_types(listing, skip, CATCHUP_LIMIT))

    # Number the items on this page, continuing from the preceding pages.
    for list_index, item in enumerate(listing.listings, start=skip + 1):
        setattr(item, 'list_index', list_index)

    response_data['listings'] = listing.listings
    response_data['author_links'] = authors_for_articles(listing.listings)
    response_data['downloads'] = dl_for_articles(listing.listings)
    response_data['latexml'] = latexml_links_for_articles(listing.listings)

    response_data.update({
        'subject': subject,
        'date': start_day,
        'next_day': next_announce_day,
        'page': page,
        'include_abs': include_abs,
        'count': count,
        'list_type': "new" if include_abs else "catchup",  # how the list macro checks to display abstract
        'paging': catchup_paging(subject, start_day, include_abs, page, count)
    })

    def author_query(article: DocMetadata, query: str)->str:
        """Return the author-search URL for ``query`` within the article's archive."""
        # Start from the fallback so the name is always bound, even when an
        # attribute lookup below raises before any assignment happens.
        archive_id = ''
        try:
            if article.primary_archive:
                archive_id = article.primary_archive.id
            elif article.primary_category:
                archive_id = article.primary_category.in_archive
        except (AttributeError, KeyError):
            # Best effort: search with an empty archive value rather than fail.
            archive_id = ''
        return str(url_for('search_archive',
                           searchtype='author',
                           archive=archive_id,
                           query=query))

    response_data['url_for_author_search'] = author_query

    return response_data, 200, headers

def get_catchup_form() -> Response:
    """Serve the catchup form, or redirect a submitted form to its catchup page.

    When the request carries both ``subject`` and ``date`` query parameters,
    the result is an empty body with a 301 status and a ``Location`` header
    for the matching catchup URL (``include_abs`` is forwarded as ``abs`` when
    present). Otherwise the data needed to render the form is returned.

    Returns:
        A ``(response_data, status_code, headers)`` tuple.
    """
    headers: Dict[str,str] = {}
    headers = add_surrogate_key(headers, ["catchup"])

    # A submitted form arrives as query parameters on this same URL.
    args = request.args
    subject = args.get('subject')
    date = args.get('date')
    include_abs = args.get('include_abs')

    if subject and date:
        target_params = {'subject': subject, 'date': date}
        if include_abs:
            target_params['abs'] = include_abs
        headers.update({
            'Location': url_for('.catchup', **target_params),
            # one month, url construction should never change
            'Surrogate-Control': 'max-age=2600000',
        })
        headers = add_surrogate_key(headers, ["catchup-redirect"])
        return {}, 301, headers

    # No submission: assemble the values the form template needs.
    response_data: Dict[str, Any] = {
        'years': [datetime.now().year, datetime.now().year-1],  # only last 90 days allowed anyways
        'months': MONTHS[1:],
        'current_month': datetime.now().strftime('%m'),
        'days': [f"{day:02d}" for day in range(1, 32)],
        'groups': GROUPS,
    }

    headers = add_surrogate_key(headers, ["catchup-form"])
    # one week, form never changes except for autoselecting the current month
    headers.update({'Surrogate-Control': 'max-age=604800'})
    return response_data, 200, headers


def _process_catchup_params(subject_str:str, date_str:str)->Tuple[Union[Group, Archive, Category], date, bool, int]:
    """Validate the catchup request parameters and convert them to usable values.

    ``subject_str`` and ``date_str`` come from the URL path; ``abs`` and
    ``page`` are read from the current request's query string.

    Returns:
        subject: a Group, Archive, or Category. Still needs to be checked for canonicalness
        start_day: date (date to catchup on)
        abs: bool (include abstracts or not)
        page: int (which page of results, default is 1)

    Raises:
        BadRequest: for unexpected query parameters, an unknown subject, a
            malformed or out-of-range date, or an invalid ``abs``/``page`` value.
    """

    # check for valid arguments
    allowed_params = {"abs", "page"}
    unexpected_params = request.args.keys() - allowed_params
    if unexpected_params:
        raise BadRequest("Unexpected parameters. Only accepted parameters are: 'page', and 'abs'")

    # subject validation
    subject: Union[Group, Archive, Category]
    if subject_str == "grp_physics":
        subject = GROUPS["grp_physics"]
    elif subject_str in ARCHIVES:
        subject = ARCHIVES[subject_str]
    elif subject_str in CATEGORIES:
        subject = CATEGORIES[subject_str]
    else:
        raise BadRequest("Invalid subject. Subject must be an archive, category or 'grp_physics'")

    # date validation
    if not re.fullmatch(r"\d{4}-\d{2}-\d{2}", date_str):  # enforce two digit days and months
        raise BadRequest("Invalid date format. Use format: YYYY-MM-DD")
    try:
        start_day = datetime.strptime(date_str, "%Y-%m-%d").date()
    except ValueError:
        raise BadRequest("Invalid date format. Use format: YYYY-MM-DD")
    # only allow dates within the last 90 days (91 just in case time zone differences)
    today = datetime.now().date()
    earliest_allowed = today - timedelta(days=91)
    if start_day < earliest_allowed:
        # TODO link to earliest allowed date
        raise BadRequest(f"Invalid date: {start_day}. Catchup only allowed for past 90 days")
    elif start_day > today:
        raise BadRequest(f"Invalid date: {start_day}. Can't request date later than today")

    # include abstract or not
    abs_str = request.args.get("abs", "False")
    if abs_str == "True":
        include_abs = True
    elif abs_str == "False":
        include_abs = False
    else:
        raise BadRequest("Invalid abs value. Use ?abs=True to include abstracts or ?abs=False to not")

    # select page number (page defaults to 1)
    page_str = request.args.get("page", "1")
    # isdecimal(), not isdigit(): isdigit() is also true for characters such
    # as superscript digits, which int() rejects with ValueError. That would
    # surface as a server error instead of a BadRequest.
    if not page_str.isdecimal():
        raise BadRequest("Invalid page value. Page value should be a positive integer like ?page=3")
    page = int(page_str)
    if page < 1:
        raise BadRequest("Invalid page value. Page value should be a positive integer like ?page=3")

    return subject, start_day, include_abs, page

def catchup_paging(subject: Union[Group, Archive, Category], day:date, include_abs:bool, page: int, count:int)-> List[Tuple[str,str]]:
    """Create the page links shown when a catchup listing spans several pages.

    Args:
        subject: subject whose ``id`` is used in the generated URLs.
        day: announce day being displayed.
        include_abs: whether abstracts are shown; carried into every link.
        page: the page currently displayed (1-based).
        count: total number of items across all pages.

    Returns:
        A list of ``(label, url)`` tuples in display order. Entries that should
        not be rendered as links (the current page and ``'...'`` gaps) carry
        the sentinel url ``'no-link'``. The list is empty when everything fits
        on one page.
    """
    if CATCHUP_LIMIT >= count:  # only one page
        return []

    # Ceiling division: an exact multiple of CATCHUP_LIMIT must not produce an
    # extra, empty trailing page.
    total_pages = (count + CATCHUP_LIMIT - 1) // CATCHUP_LIMIT
    # abs is always passed, so url_base always ends in a query string and the
    # page parameter can be appended with '&'.
    url_base = url_for('.catchup', subject=subject.id, date=day.strftime('%Y-%m-%d'), abs=include_abs)
    page_links: List[Tuple[str,str]] = []

    if total_pages < 10:  # realistically there should be at most 2-3 pages per day
        for i in range(1, total_pages + 1):
            if i == page:
                page_links.append((str(i), 'no-link'))
            else:
                page_links.append((str(i), url_base + f'&page={i}'))

    else:  # shouldn't happen but it's handled: first page, current page, last page
        if page != 1:
            page_links.append(('1', url_base + '&page=1'))
        if page > 2:
            page_links.append(('...', 'no-link'))
        page_links.append((str(page), 'no-link'))
        if page < total_pages - 1:
            page_links.append(('...', 'no-link'))
        if page != total_pages:
            page_links.append((str(total_pages), url_base + f'&page={total_pages}'))

    return page_links

def catchup_index_for_types(new_count:int, cross_count:int, rep_count:int, subject: Union[Group, Archive, Category], day:date, include_abs:bool, page: int) ->Dict[str, Any]:
    """Build the section index (new / cross-lists / replacements) for a catchup page.

    Each non-empty section contributes a ``(label, url, index)`` tuple, where
    ``index`` is the position of the section's first item within the page it
    starts on, and ``url`` is empty when that page is the one being displayed.
    Page numbers and indexes both start at 1.

    Returns:
        ``{'index_for_types': [...]}`` with the tuples in display order.
    """
    # (label, item count, overall 1-based position of the section's first item)
    sections = (
        ('New submissions', new_count, 1),
        ('Cross-lists', cross_count, new_count + 1),
        ('Replacements', rep_count, new_count + cross_count + 1),
    )

    ift = []
    for label, section_count, first_item in sections:
        if section_count <= 0:
            continue

        # item CATCHUP_LIMIT is on page 1, item CATCHUP_LIMIT + 1 is on page 2
        start_page = (first_item - 1) // CATCHUP_LIMIT + 1
        index_on_page = first_item - (start_page - 1) * CATCHUP_LIMIT

        if page == start_page:
            link = ''
        else:
            link = url_for('.catchup', subject=subject.id, date=day.strftime('%Y-%m-%d'), abs=include_abs, page=start_page)
        ift.append((label, link, index_on_page))

    return {'index_for_types': ift}
8 changes: 4 additions & 4 deletions browse/controllers/list_page/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ def sub_sections_for_types(
continued=skipn > 0,
last=skipn >= new_count - shown,
visible=len(news)>0,
heading=f'New submissions for {date} '
heading=f'New submissions '
)

sec_cross=ListingSection(
Expand All @@ -559,7 +559,7 @@ def sub_sections_for_types(
continued=skipn + 1 > cross_start,
last=skipn >= rep_start - shown,
visible=len(crosses)>0,
heading=f'Cross submissions for {date} '
heading=f'Cross submissions '
)

sec_rep=ListingSection(
Expand All @@ -569,7 +569,7 @@ def sub_sections_for_types(
continued=skipn + 1 > rep_start,
last=last_shown >= new_count + cross_count + rep_count,
visible=len(reps)>0,
heading=f'Replacement submissions for {date} '
heading=f'Replacement submissions '
)

secs=[sec_new, sec_cross, sec_rep]
Expand All @@ -582,7 +582,7 @@ def sub_sections_for_types(
showing = showing + 'last '
if not sec.last and not sec.continued:
showing = showing + 'first '
sec.heading += f'({showing}{len(sec.items)} of {sec.total} entries )'
sec.heading += f'({showing}{len(sec.items)} of {sec.total} entries)'

return {'sub_sections_for_types': secs}

Expand Down
18 changes: 17 additions & 1 deletion browse/routes/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
list_page,
prevnext,
stats_page,
tb_page
tb_page,
catchup_page
)
from browse.controllers.openurl_cookie import make_openurl_cookie, get_openurl_page
from browse.controllers.cookies import get_cookies_page, cookies_to_set
Expand Down Expand Up @@ -129,6 +130,21 @@ def category_taxonomy() -> Any:
None,
)

@blueprint.route("catchup", methods=["GET"], endpoint="catchup_form")
def catchup_form() -> Response:
    """Render the catchup form, or relay the controller's non-OK result as is."""
    response, code, headers = catchup_page.get_catchup_form()
    if code != status.OK:
        # Non-OK results (the controller answers a submitted form with a 301)
        # are returned without rendering a template.
        return response, code, headers  # type: ignore
    return render_template("catchup_form.html", **response), code, headers  # type: ignore

@blueprint.route("catchup/<subject>/<date>", methods=["GET"])
def catchup(subject:str, date:str) -> Response:
    """Render the catchup listing for ``subject`` on ``date``."""
    response, code, headers = catchup_page.get_catchup_page(subject, date)
    headers = add_surrogate_key(headers, ["catchup"])
    if code != status.OK:
        # Anything other than OK is passed back without rendering a template.
        return response, code, headers  # type: ignore
    return render_template("catchup.html", **response), code, headers  # type: ignore

@blueprint.route("institutional_banner", methods=["GET"])
def institutional_banner() -> Any:
try:
Expand Down
19 changes: 0 additions & 19 deletions browse/routes/unimplemented.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@

# These commented-out blueprints are ones that the arxiv-browse or
# arxiv-search systems should probably implement sooner or later.
#
# @blueprint.route("/pdf/", defaults={"path":""})
# @blueprint.route("/pdf/<path:path>")
# @blueprint.route("/docmeta/", defaults={"path":""})
# @blueprint.route("/docmeta/<path:path>")
# @blueprint.route("/docmeta_bulk/", defaults={"path":""})
Expand All @@ -31,18 +28,8 @@
# @blueprint.route("/tar/<path:path>")
# @blueprint.route("/abstar/", defaults={"path":""})
# @blueprint.route("/abstar/<path:path>")
# @blueprint.route("/e-print/", defaults={"path":""})
# @blueprint.route("/e-print/<path:path>")
# @blueprint.route("/src/", defaults={"path":""})
# @blueprint.route("/src/<path:path>")
# @blueprint.route("/list/", defaults={"path":""})
# @blueprint.route("/list/<path:path>")
# @blueprint.route("/view/", defaults={"path":""})
# @blueprint.route("/view/<path:path>")
# @blueprint.route("/catchup/", defaults={"path":""})
# @blueprint.route("/catchup/<path:path>")
# @blueprint.route("/year/", defaults={"path":""})
# @blueprint.route("/year/<path:path>")
# @blueprint.route("/cits/", defaults={"path":""})
# @blueprint.route("/cits/<path:path>")
# @blueprint.route("/refs/", defaults={"path":""})
Expand All @@ -51,18 +38,12 @@
# @blueprint.route("/ps/<path:path>")
# @blueprint.route("/psfigs/", defaults={"path":""})
# @blueprint.route("/psfigs/<path:path>")
# @blueprint.route("/format/", defaults={"path":""})
# @blueprint.route("/format/<path:path>")
# @blueprint.route("/dvi/", defaults={"path":""})
# @blueprint.route("/dvi/<path:path>")
# @blueprint.route("/pdf/", defaults={"path":""})
# @blueprint.route("/pdf/<path:path>")
# @blueprint.route("/openurl-cookie/", defaults={"path":""})
# @blueprint.route("/openurl-cookie/<path:path>")
# @blueprint.route("/openurl-resolver/", defaults={"path":""})
# @blueprint.route("/openurl-resolver/<path:path>")
# @blueprint.route("/html/", defaults={"path":""})
# @blueprint.route("/html/<path:path>")
# @blueprint.route("/ftp/<path:path>")


Expand Down
Loading

0 comments on commit 54fee52

Please sign in to comment.