From 17fa779bab85066080948569691649d8f599154e Mon Sep 17 00:00:00 2001 From: Damien Alexandre Date: Mon, 16 Jun 2014 18:44:36 +0200 Subject: [PATCH 1/5] Fix #7, force the content encoding and hack around libxml2 bugs --- composer.json | 2 +- src/JoliTypo/Fixer.php | 59 +++++++++++++++++++-------- tests/JoliTypo/Tests/Html5Test.php | 25 ++++++++++++ tests/JoliTypo/Tests/JoliTypoTest.php | 4 +- 4 files changed, 71 insertions(+), 19 deletions(-) diff --git a/composer.json b/composer.json index 335d127..1c04760 100644 --- a/composer.json +++ b/composer.json @@ -15,7 +15,7 @@ "require": { "php": ">=5.3.0", "ext-mbstring": "*", - "lib-libxml": ">2.6.32", + "lib-libxml": "*", "org_heigl/hyphenator": ">=2.0.3" }, "conflict": { diff --git a/src/JoliTypo/Fixer.php b/src/JoliTypo/Fixer.php index e4eab33..01c35b0 100644 --- a/src/JoliTypo/Fixer.php +++ b/src/JoliTypo/Fixer.php @@ -201,30 +201,57 @@ private function loadDOMDocument($content) $dom->substituteEntities = false; $dom->formatOutput = false; + // Change mb and libxml config $libxml_current = libxml_use_internal_errors(true); + $mb_detect_current = mb_detect_order(); + mb_detect_order("ASCII,UTF-8,ISO-8859-1,windows-1252,iso-8859-15"); - // Little hack to force UTF-8 - if (strpos($content, '' : ''; - $loaded = $dom->loadHTML($hack . 
$content); - } else { - $loaded = $dom->loadHTML($content); - } + $loaded = $dom->loadHTML($this->fixContentEncoding($content)); + // Restore mb and libxml config libxml_use_internal_errors($libxml_current); + mb_detect_order(implode(',', $mb_detect_current)); if (!$loaded) { throw new InvalidMarkupException("Can't load the given HTML via DomDocument"); } - foreach ($dom->childNodes as $item) { - if ($item->nodeType === XML_PI_NODE) { - $dom->removeChild($item); // remove encoding hack - break; - } + return $dom; + } + + /** + * Convert the content encoding properly and add Content-Type meta if HTML document + * + * @see http://php.net/manual/en/domdocument.loadhtml.php#91513 + * @see https://github.com/jolicode/JoliTypo/issues/7 + * + * @param $content + * @return string + */ + private function fixContentEncoding($content) + { + if (!empty($content)) { + // Little hack to force UTF-8 + if (strpos($content, '' : ''; + $content = $hack . $content; + } + + $encoding = mb_detect_encoding($content); + $headpos = mb_strpos($content, ''); + + // Add a meta to the section + if (false !== $headpos) { + $headpos +=6; + $content = mb_substr($content, 0, $headpos) . + '' . 
+ mb_substr($content, $headpos); + } + + $content = mb_convert_encoding($content, 'HTML-ENTITIES', $encoding); } - return $dom; + return $content; } /** @@ -235,9 +262,9 @@ private function exportDOMDocument(\DOMDocument $dom) { // Remove added body & doctype $content = preg_replace(array( - "/^\<\!DOCTYPE.*?/si", - "!$!si"), - "", $dom->saveHTML()); + "/^\<\!DOCTYPE.*?.*?/si", + "!$!si" + ), "", $dom->saveHTML()); return trim($content); } diff --git a/tests/JoliTypo/Tests/Html5Test.php b/tests/JoliTypo/Tests/Html5Test.php index 9adedfd..8235a7a 100644 --- a/tests/JoliTypo/Tests/Html5Test.php +++ b/tests/JoliTypo/Tests/Html5Test.php @@ -17,4 +17,29 @@ public function testHtml5Markup() // The test passes if there is no warning about this fix: $this->assertEquals($html5, $fixer->fix($html5)); } + + public function testFullPageMarkup() + { + $fixer = new Fixer(array(new Fixer\EnglishQuotes())); + $this->assertInstanceOf('JoliTypo\Fixer', $fixer); + + $html = << + + + + Coucou + + + "Who Let the Dogs Out?" is a song written and originally recorded by Anslem Douglas (titled "Doggie"). + + +HTML; + + $fixed = <<assertEquals($fixed, $fixer->fix($html)); + } } diff --git a/tests/JoliTypo/Tests/JoliTypoTest.php b/tests/JoliTypo/Tests/JoliTypoTest.php index 6e5c514..7cf1974 100644 --- a/tests/JoliTypo/Tests/JoliTypoTest.php +++ b/tests/JoliTypo/Tests/JoliTypoTest.php @@ -135,10 +135,10 @@ public function testBadEncoding() $this->assertEquals("Mentions Légales", $fixer->fix(utf8_encode(utf8_decode("Mentions Légales")))); - // JoliTypo can't handle double encoded UTF-8 strings, nor ISO strings + // JoliTypo can handle double encoded UTF-8 strings, or ISO strings, but that's not a feature. 
$isoString = mb_convert_encoding("Mentions Légales", "ISO-8859-1", "UTF-8"); $this->assertEquals("Mentions Légales", $fixer->fix(utf8_encode($isoString))); - $this->assertNotEquals("Mentions Légales", $fixer->fix($isoString)); + $this->assertEquals("Mentions Légales", $fixer->fix($isoString)); $this->assertEquals("Mentions Légales", $fixer->fix(utf8_encode(utf8_encode($isoString)))); } } From a7fc101cb5b0a99ef83b775513e1ee3eb3338d99 Mon Sep 17 00:00:00 2001 From: Damien Alexandre Date: Mon, 16 Jun 2014 18:48:38 +0200 Subject: [PATCH 2/5] Complete the requirements and add a warning about --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5762100..bbe1a75 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ It's designed to be: Quick usage =========== -Just tell the Fixer class [which Fixer](#available-fixers) you want to run on your HTML content and then, call `fix()`: +Just tell the Fixer class [which Fixer](#available-fixers) you want to run on your **HTML contents** and then, call `fix()`: ```php use JoliTypo\Fixer; @@ -50,9 +50,13 @@ $fixed_content = $fixer->fix('

Je suis "très content" de t\'avoir invité sur For your ease of use, you can find [ready to use list of Fixer for your language here](#fixer-recommendations-by-locale). Micro-typography is nothing like a standard or a law, what really matter is consistency, so feel free to use your own lists. +Also, be advised that JoliTypo is intended to be used on HTML contents (not pages) and will remove potential ``, `` and `` tags. + Installation ============ +Requirements are handled by Composer (libxml and mbstring are required). + ``` composer require jolicode/jolitypo 0.1.* ``` From d5be8bed11b678dca3f69557308c827eae8dfcf8 Mon Sep 17 00:00:00 2001 From: Damien Alexandre Date: Tue, 17 Jun 2014 09:52:49 +0200 Subject: [PATCH 3/5] Avoid fixing empty string, trim any content before fixing --- src/JoliTypo/Fixer.php | 7 ++++++- tests/JoliTypo/Tests/JoliTypoTest.php | 10 ++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/JoliTypo/Fixer.php b/src/JoliTypo/Fixer.php index 01c35b0..0accf7a 100644 --- a/src/JoliTypo/Fixer.php +++ b/src/JoliTypo/Fixer.php @@ -62,10 +62,15 @@ public function __construct($rules) */ public function fix($content) { + $trimmed = trim($content); + if (empty($trimmed)) { + return $content; + } + // Get a clean new StateBag $this->state_bag = new StateBag(); - $dom = $this->loadDOMDocument($content); + $dom = $this->loadDOMDocument($trimmed); $this->processDOM($dom, $dom); diff --git a/tests/JoliTypo/Tests/JoliTypoTest.php b/tests/JoliTypo/Tests/JoliTypoTest.php index 7cf1974..5fd8b9a 100644 --- a/tests/JoliTypo/Tests/JoliTypoTest.php +++ b/tests/JoliTypo/Tests/JoliTypoTest.php @@ -141,6 +141,16 @@ public function testBadEncoding() $this->assertEquals("Mentions Légales", $fixer->fix($isoString)); $this->assertEquals("Mentions Légales", $fixer->fix(utf8_encode(utf8_encode($isoString)))); } + + public function testEmptyContent() + { + $fixer = new Fixer(array('Trademark')); + $this->assertInstanceOf('JoliTypo\Fixer', $fixer); + + 
$this->assertEquals("", $fixer->fix("")); + $this->assertEquals("\n ", $fixer->fix("\n ")); + $this->assertEquals("some content", $fixer->fix("\n some content")); + } } class FakeFixer {} From 95c05c17bb01801c85e1d7b35067703eefbc0eaf Mon Sep 17 00:00:00 2001 From: Damien Alexandre Date: Tue, 17 Jun 2014 14:59:24 +0200 Subject: [PATCH 4/5] Complete the CHANGELOG with 0.1.4 changes --- CHANGELOG.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52f5cc0..0eb7bd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,16 @@ CHANGELOG ========= -### 0.1... (????-??-??) ### +### ??? ### + +### 0.1.4 (2014-06-17) ### + +* add HHVM tests on travis +* add libxml to composer requirements +* set APC 3.1.11 as conflict (https://bugs.php.net/bug.php?id=62190) +* do not process empty contents +* apply `mb_convert_encoding($content, 'HTML-ENTITIES', $encoding)` on all contents to fix encoding +* workaround for old (2.6.32) libxml versions (#7) ### 0.1.3 (2013-11-15) ### From c87c681f78772665c8e89451d521dc90d06ba6a4 Mon Sep 17 00:00:00 2001 From: Damien Alexandre Date: Tue, 17 Jun 2014 15:05:49 +0200 Subject: [PATCH 5/5] Improve the hyphenator version requirement --- CHANGELOG.md | 1 + composer.json | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0eb7bd3..3347d38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ CHANGELOG * do not process empty contents * apply `mb_convert_encoding($content, 'HTML-ENTITIES', $encoding)` on all contents to fix encoding * workaround for old (2.6.32) libxml versions (#7) +* better Org_Heigl_Hyphenator version requirement ### 0.1.3 (2013-11-15) ### diff --git a/composer.json b/composer.json index 1c04760..d49589a 100644 --- a/composer.json +++ b/composer.json @@ -16,7 +16,7 @@ "php": ">=5.3.0", "ext-mbstring": "*", "lib-libxml": "*", - "org_heigl/hyphenator": ">=2.0.3" + "org_heigl/hyphenator": "~2.0.3" }, "conflict": { 
"ext-apc": "3.1.11"