From 44f0f69a9a385de904a80bd62f78590be65717b1 Mon Sep 17 00:00:00 2001 From: Ray Luo Date: Fri, 9 Feb 2018 15:23:58 -0800 Subject: [PATCH 1/3] Use regex to grab xml content as-is --- adal/wstrust_response.py | 30 +++++++++++++++++++++++++++++- tests/test_wstrust_response.py | 18 ++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/adal/wstrust_response.py b/adal/wstrust_response.py index ecdc398f..cfc57c3f 100644 --- a/adal/wstrust_response.py +++ b/adal/wstrust_response.py @@ -55,6 +55,20 @@ def scrub_rstr_log_message(response_str): return 'RSTR Response: ' + scrubbed_rstr +def findall_content(xml_string, tag): + """ + Given a tag name without any prefix, + this function returns a list of the raw content inside this tag as-is. + + >>> findall_content(" what ever content ", "foo") + [" what ever content "] + + """ + # https://www.w3.org/TR/REC-xml/#NT-NameChar + pattern = r"<(?:\w+:)?%(tag)s>(.*) ever content +in multiple lines +""" + sample = "" + content + "" + self.assertEqual([content], findall_content(sample, "foo")) + self.assertEqual([], findall_content(sample, "nonexist")) + + def test_findall_content_for_real(self): + with open(os.path.join(os.getcwd(), 'tests', 'wstrust', 'RSTR.xml')) as f: + rstr = f.read() + wstrustResponse = WSTrustResponse(_call_context, rstr, WSTrustVersion.WSTRUST13) + wstrustResponse.parse() + self.assertIn("", rstr) + self.assertIn(b"", wstrustResponse.token) # It is in bytes + if __name__ == '__main__': unittest.main() From 7a8dc4581f54db50dae68288f9bf751c66a9ac11 Mon Sep 17 00:00:00 2001 From: Ray Luo Date: Tue, 20 Feb 2018 16:55:37 -0800 Subject: [PATCH 2/3] Add extra documentation and test the comparison --- adal/wstrust_response.py | 13 +++++++++++-- tests/test_wstrust_response.py | 25 ++++++++++++++++++------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/adal/wstrust_response.py b/adal/wstrust_response.py index cfc57c3f..33690ccb 100644 --- a/adal/wstrust_response.py +++ 
b/adal/wstrust_response.py @@ -63,9 +63,18 @@ def findall_content(xml_string, tag): >>> findall_content(" what ever content ", "foo") [" what ever content "] + Usually we would use XML parser to extract the data by xpath. + However the ElementTree in Python will implicitly normalize the output + by "hoisting" the inner inline namespaces into the outmost element. + The result will be a semantically equivalent XML snippet, + but not fully identical to the original one. + While this shouldn't become a problem, + in practice it could potentially confuse a picky recipient. + + Introducing this helper, based on Regex, which will return raw content as-is. """ - # https://www.w3.org/TR/REC-xml/#NT-NameChar - pattern = r"<(?:\w+:)?%(tag)s>(.*)]*)>(.*) ever content -in multiple lines -""" - sample = "" + content + "" - self.assertEqual([content], findall_content(sample, "foo")) - self.assertEqual([], findall_content(sample, "nonexist")) + + + foo + + """ + sample = ('' + + content + + '') + + # Demonstrating how XML-based parser won't give you the raw content as-is + element = ET.fromstring(sample).findall('{SAML:assertion}Assertion')[0] + assertion_via_xml_parser = ET.tostring(element) + self.assertNotEqual(content, assertion_via_xml_parser) + self.assertNotIn(b"", assertion_via_xml_parser) + + # The findall_content() helper, based on Regex, will return content as-is. 
+ self.assertEqual([content], findall_content(sample, "Wrapper")) def test_findall_content_for_real(self): with open(os.path.join(os.getcwd(), 'tests', 'wstrust', 'RSTR.xml')) as f: From f449f2f7d8ae05dd2feb6debcb0d7e411e1bc2ab Mon Sep 17 00:00:00 2001 From: Ray Luo Date: Fri, 2 Mar 2018 12:14:37 -0800 Subject: [PATCH 3/3] Further clarify the rationale of this workaround based on research --- adal/wstrust_response.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/adal/wstrust_response.py b/adal/wstrust_response.py index 33690ccb..5b2f5eea 100644 --- a/adal/wstrust_response.py +++ b/adal/wstrust_response.py @@ -63,15 +63,21 @@ def findall_content(xml_string, tag): >>> findall_content(" what ever content ", "foo") [" what ever content "] + Motivation: + Usually we would use XML parser to extract the data by xpath. However the ElementTree in Python will implicitly normalize the output by "hoisting" the inner inline namespaces into the outmost element. The result will be a semantically equivalent XML snippet, but not fully identical to the original one. - While this shouldn't become a problem, - in practice it could potentially confuse a picky recipient. - - Introducing this helper, based on Regex, which will return raw content as-is. + While this effect shouldn't become a problem in all other cases, + it does not seem to fully comply with Exclusive XML Canonicalization spec + (https://www.w3.org/TR/xml-exc-c14n/), and voids the SAML token signature. + SAML signature algo needs the "XML -> C14N(XML) -> Signed(C14N(Xml))" order. + + The binary extension lxml is probably the canonical way to solve this + (https://stackoverflow.com/questions/22959577/python-exclusive-xml-canonicalization-xml-exc-c14n) + but here we use this workaround, based on Regex, to return raw content as-is. """ # \w+ is good enough for https://www.w3.org/TR/REC-xml/#NT-NameChar pattern = r"<(?:\w+:)?%(tag)s(?:[^>]*)>(.*)