From 560d6d1776ee3a13af26ac4e04171a5d32f3923f Mon Sep 17 00:00:00 2001 From: Matthew Kesack Date: Wed, 25 Sep 2024 04:55:09 -0400 Subject: [PATCH] Fixes "case-sensitive" URI matching for Disallow rules in robots.txt (#46) * Fixes "case-sensitive" URI matching for Disallow rules in robots.txt Based on Issue #45 (Robots.txt "Disallow" URI matching should be case-sensitive), I removed the use of `strtolower` in `parseDisallow` to preserve the URI's case sensitivity. The issue was opened based on the robots.txt RFC standard as documented by Google, which states: "The value of the disallow rule is case-sensitive." (Source: https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt?hl=en#disallow) --- I ran PHPUnit and all existing tests passed, since none specifically tested case sensitivity. I added the test `the_disallows_uri_check_is_case_sensitive` to cover this issue. * Remove .idea files --------- Co-authored-by: Matthew Kesack --- src/RobotsTxt.php | 2 +- tests/RobotsTxtTest.php | 9 +++++++++ tests/data/robots.txt | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/RobotsTxt.php b/src/RobotsTxt.php index f5d0e14..5520b9e 100644 --- a/src/RobotsTxt.php +++ b/src/RobotsTxt.php @@ -253,7 +253,7 @@ protected function parseUserAgent(string $line): string protected function parseDisallow(string $line): string { - return trim(substr_replace(strtolower(trim($line)), '', 0, 8), ': '); + return trim(substr_replace(trim($line), '', 0, 8), ': '); } protected function isDisallowLine(string $line): string diff --git a/tests/RobotsTxtTest.php b/tests/RobotsTxtTest.php index 290848a..55da2ed 100644 --- a/tests/RobotsTxtTest.php +++ b/tests/RobotsTxtTest.php @@ -149,6 +149,15 @@ public function the_disallows_user_agent_check_is_case_insensitive() $this->assertFalse($robots->allows('/no-agents', strtolower('UserAgent007'))); } + /** @test */ + public function the_disallows_uri_check_is_case_sensitive() + { + $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt'); + 
$this->assertFalse($robots->allows('/Case-Sensitive/Disallow')); + $this->assertTrue($robots->allows(strtolower('/Case-Sensitive/Disallow'))); + } + /** @test */ public function it_can_handle_multiple_user_agent_query_strings() { diff --git a/tests/data/robots.txt b/tests/data/robots.txt index bce017c..13a388b 100644 --- a/tests/data/robots.txt +++ b/tests/data/robots.txt @@ -7,6 +7,7 @@ Disallow: /nl/admin/ Disallow: /en/admin/* Disallow: /fr/admin$ Disallow: /es/admin-disallow/ +Disallow: /Case-Sensitive/Disallow User-agent: google Disallow: /