Skip to content

Commit

Permalink
TIKA-4153 -- revert changes to robots.txt detection and add unit test…
Browse files Browse the repository at this point in the history
… for robots file starting with comments

(cherry picked from commit 7825b59)
  • Loading branch information
tballison committed Oct 16, 2023
1 parent a427bac commit 575298c
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2135,16 +2135,14 @@
<!-- should have a higher priority than rfc822 - TIKA-3489 -->
<!-- Content-sniffing rule for a robots.txt style file: every clause below looks
     for one of the directives user-agent:, allow:, disallow: or sitemap:,
     compared case-insensitively (type="stringignorecase"). The enclosing
     mime-type element is outside this view; presumably it is text/x-robots,
     to be confirmed. -->
<!-- NOTE(review): this listing appears to be a diff view with the +/- markers
     stripped (the hunk header reports 16 lines before and 14 after). The two
     nested minShouldMatch="1" groups look like the removed lines and the flat
     eight-clause list after them like the added lines. Confirm against the
     commit before treating this span as the real file content. -->
<magic priority="55">
<!-- minShouldMatch="2": presumably at least two child clauses must match. -->
<match minShouldMatch="2">
<!-- Group 1: a directive located exactly at the start of the data (offset="0"). -->
<match minShouldMatch="1">
<match value="user-agent:" type="stringignorecase" offset="0"/>
<match value="sitemap:" type="stringignorecase" offset="0"/>
</match>
<!-- Group 2: a directive preceded by a newline, i.e. starting a later line;
     offset="0:1000" presumably limits the search to the first 1000 bytes. -->
<match minShouldMatch="1">
<match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
<match value="\nallow:" type="stringignorecase" offset="0:1000"/>
<match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
<match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
</match>
<!-- Flat form: the four directives at offset 0, then the same four in their
     newline-prefixed form. A file that opens with comment lines has no
     directive at offset 0, so it can only reach the threshold through the
     newline-prefixed clauses. -->
<match value="user-agent:" type="stringignorecase" offset="0"/>
<match value="allow:" type="stringignorecase" offset="0"/>
<match value="disallow:" type="stringignorecase" offset="0"/>
<match value="sitemap:" type="stringignorecase" offset="0"/>
<match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
<match value="\nallow:" type="stringignorecase" offset="0:1000"/>
<match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
<match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
</match>
</magic>
<sub-class-of type="text/plain"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1023,6 +1023,7 @@ public void testEmail() throws IOException {
/**
 * Asserts, via {@code assertTypeByData}, that each robots fixture resolves to
 * {@code text/x-robots}. Fixtures are checked in the order listed, so a failure
 * on the first one stops the test before the second is tried.
 *
 * NOTE(review): {@code testRobots2.txt} is presumably the fixture that begins
 * with '#' comment lines; confirm against the test resources.
 */
@Test
public void testRobots() throws Exception {
    final String expectedType = "text/x-robots";
    final String[] fixtures = {"testRobots.txt", "testRobots2.txt"};
    for (String fixture : fixtures) {
        assertTypeByData(expectedType, fixture);
    }
}

@Test
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# elevate robots begin
# robots.txt, added by the Elevate plugin for WordPress
# file version: 1
User-agent: *
Disallow: /wp-admin/
Disallow: /readme.html
Disallow: /trackback/
Allow: /wp-admin/admin-ajax.php
Allow: /wp-content/uploads
Sitemap: https://blahdeblah.com/sitemap.xml
# elevate robots end

0 comments on commit 575298c

Please sign in to comment.