Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gets integration tests passing #547

Draft
wants to merge 6 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions browse/controllers/files/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def unavailable(arxiv_id: Identifier) -> Response:


def not_pdf(arxiv_id: Identifier) -> Response:
return make_response(render_template("dissemination/unavailable.html",
return make_response(render_template("dissemination/not_pdf.html",
arxiv_id=arxiv_id), 404, {})


Expand Down Expand Up @@ -129,5 +129,5 @@ def bad_id(arxiv_id: Union[Identifier,str], err_msg: str) -> Response:

def cannot_build_pdf(arxiv_id: Identifier, msg: str) -> Response:
return make_response(render_template("dissemination/cannot_build_pdf.html",
err_msg=msg,
msg=msg,
arxiv_id=arxiv_id), 404, {})
2 changes: 1 addition & 1 deletion browse/controllers/files/dissemination.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def get_dissemination_resp(format: Acceptable_Format_Requests,
except IdentifierException as ex:
return bad_id(arxiv_id_str, str(ex))
item = get_article_store().dissemination(format, arxiv_id)
logger. debug(f"dissemination_for_id(%s) was %s", arxiv_id.idv, item)
logger.debug(f"dissemination (%s) was %s", arxiv_id.idv, item)
if not item or item == "VERSION_NOT_FOUND" or item == "ARTICLE_NOT_FOUND":
return not_found(arxiv_id)
elif item == "WITHDRAWN" or item == "NO_SOURCE":
Expand Down
9 changes: 3 additions & 6 deletions browse/services/dissemination/article_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import requests

from arxiv.identifier import Identifier
from arxiv.legacy.papers.dissemination.reasons import FORMATS
from arxiv.legacy.papers.dissemination.reasons import FORMATS, reasons
from arxiv.document.metadata import DocMetadata, VersionEntry
from arxiv.document.exceptions import (
AbsDeletedException, AbsNotFoundException, AbsVersionNotFoundException)
Expand Down Expand Up @@ -107,9 +107,6 @@ def _is_deleted(id: str) -> Optional[str]:
return DELETED_PAPERS.get(id, None)


def _unset_reasons(str: str, fmt:FORMATS) -> Optional[str]:
pass

def from_genpdf_location(location: str) -> typing.Tuple[str, str]:
"""Translates the genpdf-api redirect location for the genpdf object store.
returns the bucket name and key as a tuple if it is a gcp bucket.
Expand All @@ -136,7 +133,7 @@ def __init__(self,
metaservice: DocMetadataService,
objstore: ObjectStore,
genpdf_store: ObjectStore,
reasons: Callable[[str, FORMATS], Optional[str]] = _unset_reasons,
reasons: Callable[[str, FORMATS], Optional[str]] = reasons,
is_deleted: Callable[[str], Optional[str]] = _is_deleted
):
self.metadataservice = metaservice
Expand Down Expand Up @@ -318,7 +315,7 @@ def get_ancillary_files(self, docmeta: DocMetadata) -> List[Dict]:

def _pdf(self, arxiv_id: Identifier, docmeta: DocMetadata, version: VersionEntry) -> FormatHandlerReturn:
"""Handles getting the `FileObj` for a PDF request."""
if version.source_flag.cannot_pdf:
if version.source_flag.cannot_pdf or version.source_format == 'html':
return "NOT_PDF"

res = self.reasons(arxiv_id.idv, 'pdf')
Expand Down
2 changes: 1 addition & 1 deletion browse/templates/dissemination/bad_id.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% extends "base/base.html" %}
{% extends "base.html" %}

{# This is the template that gets shown when the client provides
something that is not a valid ID. #}
Expand Down
22 changes: 10 additions & 12 deletions browse/templates/dissemination/cannot_build_pdf.html
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
{% extends "base/base.html" %}
{% extends "base.html" %}

{%- block content -%}
<h1>PDF unavailable ...</h1>
<h1>PDF unavailable</h1>

<p>Our <b>automated</b> source to PDF conversion system was unable to produce PDF for the paper:
<p>Our automated source to PDF conversion system was unable to produce PDF for the paper
<a href="{{url_for('browse.abstract', arxiv_id=arxiv_id.ids)}}">{{arxiv_id.ids}}</a>.</p>

<p>Incomplete or corrupted files were submitted. This failure can be resolved by replacing missing or corrupted files. The reason for failure is recorded as:<br />
<b>{{msg}}</b></p>
<p>The reason for failure is recorded as: <b>{{msg}}</b>
</p>

<p>Return to the<a href="{{url_for('browse.abstract', arxiv_id=arxiv_id.ids)}}">abstract</a> for an
alternative link to the
<a href="{{url_for('src.src', arxiv_id_str=arxiv_id.ids)}}">source</a>, or to find an
email address to contact the author.</p>
<p>Return to the <a href="{{url_for('browse.abstract',
arxiv_id=arxiv_id.ids)}}">abstract</a> for a link to
the <a href="{{url_for('src.src', arxiv_id_str=arxiv_id.ids)}}">source</a> or
to find an email address to contact the author.</p>

<p>For help regarding the automated source to PDF system,
contact <a href="mailto:help@arxiv.org">help@arxiv.org</a>,
remembering to specify the problematic archive and paper number.</p>

contact <a href="mailto:help@arxiv.org">help@arxiv.org</a>.</p>
{%- endblock -%}
4 changes: 2 additions & 2 deletions browse/templates/dissemination/multiple_files.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% extends "base/base.html" %}
{% extends "base.html" %}

{%- block title -%}{{arxiv_id.ids}}{%- endblock -%}

Expand All @@ -9,4 +9,4 @@ <h1>The following files are available for {{ arxiv_id.idv }}</h1>
<li><a href="{{url_for('dissemination.html', arxiv_id=arxiv_id.idv)}}/{{file}}" title="{{file}}" id="{{ arxiv_id.ids }}-{{file}}" aria-labelledby="{{ arxiv_id.ids }}-{{file}}">{{file}}</a></li>
{% endfor %}
</ul>
{%- endblock -%}
{%- endblock -%}
2 changes: 1 addition & 1 deletion browse/templates/dissemination/no_html.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% extends "base/base.html" %}
{% extends "base.html" %}

{# This is when an HTML version is unavailable for some reason:
either we haven't converted the LaTeX or failed to, or it's another type of document that isn't LaTeX or native HTML. #}
Expand Down
2 changes: 1 addition & 1 deletion browse/templates/dissemination/not_found.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% extends "base/base.html" %}
{% extends "base.html" %}

{# This is when the arxiv ID from the client is valid but the article
or version does not exist. #}
Expand Down
13 changes: 13 additions & 0 deletions browse/templates/dissemination/not_pdf.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{% extends "base.html" %}

{# This is the template that gets shown when the paper and version
exist, but the source is not PDF and cannot be built into a PDF (e.g. HTML source). #}

{%- block content -%}
<h1>{{arxiv_id.ids}} does not build PDF</h1>
<p>
The source for the paper
<a href="{{url_for('browse.abstract', arxiv_id=arxiv_id.ids)}}">{{arxiv_id.id}}</a>
cannot be built to a PDF by our systems.
</p>
{%- endblock -%}
11 changes: 7 additions & 4 deletions browse/templates/dissemination/unavailable.html
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
{% extends "base/base.html" %}
{% extends "base.html" %}

{# This is the template that gets shown when the paper and version
exists, but there is no file like a PDF. This is an error condition that should
not happen but needs to be covered. #}
exists, but there is no file like a PDF. This can happen when the source is
HTML. #}

{%- block content -%}
<h1>file unavailable</h1>
<p>Our automated source to PDF conversion system has failed to produce PDF for the paper {{arxiv_id.ids}}.</p>
<p>
Our automated source to PDF conversion system has failed to produce PDF for the paper
<a href="{{url_for('browse.abstract', arxiv_id=arxiv_id.id)}}">{{arxiv_id.id}}</a>.
</p>
{%- endblock -%}
2 changes: 1 addition & 1 deletion browse/templates/dissemination/withdrawn.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% extends "base/base.html" %}
{% extends "base.html" %}

{%- block title -%}No file for {{arxiv_id.ids}}{%- endblock -%}

Expand Down
2 changes: 2 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ def _app_with_db():
@pytest.mark.integration
def test_something():
...

Run this with: `HOST=https://browse.dev.arxiv.org pytest --runintegration tests/dissemination/test_integration.py`
"""

def pytest_addoption(parser):
Expand Down
58 changes: 32 additions & 26 deletions tests/dissemination/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import requests
import os

EXPECTED_WDR_STATUS = 404

@pytest.fixture
def host():
return os.environ.get('HOST', 'http://localhost:8080')
Expand Down Expand Up @@ -223,21 +225,26 @@ def test_wdr(host):
"""These are some versions that are withdrawals."""

resp = requests.get(f"{host}/pdf/0911.3270v2")
assert resp.status_code == 200 # this version is wdr and in the legacy sytem does a 200 with a message like "paper not available"
assert resp.status_code == EXPECTED_WDR_STATUS # this version is wdr and in the legacy sytem does a 200 with a message like "paper not available"

resp = requests.get(f"{host}/pdf/0911.3270v3")
# paper exists but this version does not exist. The legacy system
# does something similar to a withdrawn in that it returns a 200 and a message like
# "source to generate pdf for this doesn't exist" but it should be a 404
assert resp.status_code == EXPECTED_WDR_STATUS
assert b'unavailable' in resp.content

# paper exists and v2 is wdr, but v1 is still available
resp = requests.head(f"{host}/pdf/2212.03351v1")
assert resp.status_code == 200
assert 'unavailable' in resp.text

resp = requests.get(f"{host}/pdf/2212.03351v1")
resp = requests.get(f"{host}/abs/2212.03351v1")
assert resp.status_code == 200
assert b"newer version of this paper has been withdrawn" in resp.content

resp = requests.get(f"{host}/pdf/2212.03351v2")
assert resp.status_code == 200
assert 'unavailable' in resp.text
assert resp.status_code == EXPECTED_WDR_STATUS
assert b'unavailable' in resp.content


@pytest.mark.integration
Expand Down Expand Up @@ -300,15 +307,7 @@ def test_does_not_exist_ps_cache(host):


@pytest.mark.integration
def test_withdrawn(host):
"""Sample of withdrawn versions"""
EXPECTED_WDR_STATUS = 404
def integration_test_of_withdrawn(arxiv_id):
resp = requests.get(f"{host}/pdf/{arxiv_id}")
assert resp.status_code == EXPECTED_WDR_STATUS, f"For withdrawn paper {arxiv_id} HTTP status code should have been {EXPECTED_WDR_STATUS} but was {resp.status_code}"
assert "The author has provided no source" in resp.text, f"For withdrawn paper {arxiv_id} the expected message was not found in the response"

wdr_ids =[
@pytest.mark.parametrize("arxiv_id", [
'1501.02398v2',
'0910.1713v4', '1307.0741v2', '2008.09101v2',
'cs/0606100v4', '2005.02207v2', '1901.07935v4', '1512.08657v3',
Expand All @@ -326,15 +325,18 @@ def integration_test_of_withdrawn(arxiv_id):
'1803.07743v4', 'cs/0612028v2', '1401.1740v2', '1603.06209v2',
'2112.02249v2', '2207.11705v3', '2208.13435v3',
'2208.13514v2', '2106.03507v4', '1211.2296v4',
'cond-mat/9810209v2', 'cond-mat/0212346v3', '1708.09372v2', ]
'cond-mat/9810209v2', 'cond-mat/0212346v3', '1708.09372v2', ])
def test_withdrawn(host, arxiv_id):
"""Sample of withdrawn versions"""
resp = requests.get(f"{host}/pdf/{arxiv_id}")
assert resp.status_code == EXPECTED_WDR_STATUS, f"For withdrawn paper {arxiv_id} HTTP status code should have been {EXPECTED_WDR_STATUS} but was {resp.status_code}"
assert "File unavailable" in resp.text, f"For withdrawn paper {arxiv_id} the expected message was not found in the response"

for arxiv_id in wdr_ids:
integration_test_of_withdrawn(arxiv_id)


@pytest.mark.integration
def test_html_src(host):
"""Submissions with HTML source"""
"""Submissions with HTML source."""
# legacy returns 200 with msg: "Unavailable, The author has provided no source to generate PDF, and no PDF."
resp = requests.get(f"{host}/pdf/cond-mat/9906075v1")
assert resp.status_code == 404
Expand All @@ -346,30 +348,30 @@ def test_html_src(host):
@pytest.mark.integration
def test_reasons(host):
"""Paper in reasons"""
msg = "submitter supplied incomplete or corrupted files"
msg = b"Incomplete or corrupted files"
resp = requests.get(f"{host}/pdf/1808.02949v1")
assert resp.status_code == 404
assert msg in resp.text
assert msg in resp.content

resp = requests.get(f"{host}/pdf/1310.4962")
assert resp.status_code == 404
assert msg in resp.text
assert msg in resp.content
resp = requests.get(f"{host}/pdf/1310.4962v1")
assert resp.status_code == 404
assert msg in resp.text
assert msg in resp.content
resp = requests.get(f"{host}/pdf/1310.4962v2")
assert resp.status_code == 404
assert msg in resp.text
assert msg in resp.content

resp = requests.get(f"{host}/pdf/physics/0411006")
assert resp.status_code == 404
assert msg in resp.text
assert msg in resp.content
resp = requests.get(f"{host}/pdf/physics/0411006")
assert resp.status_code == 404
assert msg in resp.text
assert msg in resp.content
resp = requests.get(f"{host}/pdf/physics/0411006")
assert resp.status_code == 404
assert msg in resp.text
assert msg in resp.content


@pytest.mark.integration
Expand Down Expand Up @@ -419,3 +421,7 @@ def test_deleted(host):
# resp = requests.get(f"{host}/pdf/physics/0411006")
# assert resp.status_code == 404
# assert msg in resp.text

@pytest.mark.integration
def test_html(host):
pass
Loading