Skip to content

Commit

Permalink
Update walkthrough
Browse files Browse the repository at this point in the history
  • Loading branch information
rushter committed Aug 31, 2019
1 parent 9f7b396 commit 4685c91
Showing 1 changed file with 181 additions and 20 deletions.
201 changes: 181 additions & 20 deletions examples/walkthrough.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
" <span id=\"vspan\"></span>\n",
" <h1>Welcome to selectolax tutorial</h1>\n",
" <div id=\"text\">\n",
" <p class='p3' style='display:none;'>Excepteur sint occaecat cupidatat non proident</p>\n",
" <p class='p3' style='display:none;'>Excepteur <i>sint</i> occaecat cupidatat non proident</p>\n",
" <p class='p3' vid>Lorem ipsum</p>\n",
" </div>\n",
" <div>\n",
Expand All @@ -39,7 +39,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"metadata": {
"scrolled": true
},
Expand All @@ -49,12 +49,12 @@
"output_type": "stream",
"text": [
"---------------------\n",
"Node: <p class=\"p3\" style=\"display:none;\">Excepteur sint occaecat cupidatat non proident</p>\n",
"Node: <p class=\"p3\" style=\"display:none;\">Excepteur <i>sint</i> occaecat cupidatat non proident</p>\n",
"attributes: {'class': 'p3', 'style': 'display:none;'}\n",
"node text: Excepteur sint occaecat cupidatat non proident\n",
"tag: p\n",
"parent tag: div\n",
"last child inside current node: Excepteur sint occaecat cupidatat non proident\n",
"last child inside current node: occaecat cupidatat non proident\n",
"---------------------\n",
"\n",
"---------------------\n",
Expand Down Expand Up @@ -151,8 +151,9 @@
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-7a26563add76>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mHTMLParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcss_first\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"p.p3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'not-found'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstrict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/selectolax-0.1.3-py3.6-macosx-10.7-x86_64.egg/selectolax/parser.pyx\u001b[0m in \u001b[0;36mselectolax.parser.HTMLParser.css_first\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m<ipython-input-6-7a26563add76>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mHTMLParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcss_first\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"p.p3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'not-found'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstrict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Projects/python/selectolax/selectolax/parser.pyx\u001b[0m in \u001b[0;36mselectolax.parser.HTMLParser.css_first\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/Projects/python/selectolax/selectolax/node.pxi\u001b[0m in \u001b[0;36mselectolax.parser.Node.css_first\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: Expected 1 match, but found 2 matches"
]
}
Expand All @@ -170,7 +171,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -230,8 +231,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"p <p class=\"p3\" style=\"display:none;\">Excepteur sint occaecat cupidatat non proident</p>\n",
"p <p class=\"p3\" vid=\"\">Lorem ipsum</p>\n"
"-text \n",
" \n",
"p <p class=\"p3\" style=\"display:none;\">Excepteur <i>sint</i> occaecat cupidatat non proident</p>\n",
"-text \n",
" \n",
"p <p class=\"p3\" vid=\"\">Lorem ipsum</p>\n",
"-text \n",
" \n"
]
}
],
Expand All @@ -250,7 +257,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -279,6 +286,160 @@
"print(html_parser.body.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Tag unwrapping"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"<body>\n",
" <span id=\"vspan\"></span>\n",
" <h1>Welcome to selectolax tutorial</h1>\n",
" <div id=\"text\">\n",
" <p class='p3' style='display:none;'>Excepteur <i>sint</i> occaecat cupidatat non proident</p>\n",
" <p class='p3' vid>Lorem ipsum</p>\n",
" </div>\n",
" <div>\n",
" <p id='stext'>Lorem ipsum dolor sit amet, ea quo modus meliore platonem.</p>\n",
" </div>\n",
"</body>\n",
"\n"
]
}
],
"source": [
"print(html)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<body>\n",
" <span id=\"vspan\"></span>\n",
" <h1>Welcome to selectolax tutorial</h1>\n",
" <div id=\"text\">\n",
" Excepteur sint occaecat cupidatat non proident\n",
" Lorem ipsum\n",
" </div>\n",
" <div>\n",
" Lorem ipsum dolor sit amet, ea quo modus meliore platonem.\n",
" </div>\n",
"\n",
"</body>\n"
]
}
],
"source": [
"html_parser = HTMLParser(html)\n",
"html_parser.unwrap_tags(['p', 'i'])\n",
"print(html_parser.body.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Attribute manipulation"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'data': 'secret data', 'id': 'new_id'}\n",
"{'data': 'secret data'}\n",
"<div data=\"secret data\">\n",
" <p class=\"p3\" style=\"display:none;\">Excepteur <i>sint</i> occaecat cupidatat non proident</p>\n",
" <p class=\"p3\" vid=\"\">Lorem ipsum</p>\n",
" </div>\n"
]
}
],
"source": [
"html_parser = HTMLParser(html)\n",
"node = html_parser.css_first('div#text')\n",
"node.attrs['data'] = 'secret data'\n",
"node.attrs['id'] = 'new_id'\n",
"print(node.attributes)\n",
"del node.attrs['id']\n",
"print(node.attributes)\n",
"print(node.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Tree traversal"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-undef\n",
"html\n",
"head\n",
"body\n",
"span\n",
"h1\n",
"Welcome to selectolax tutorial\n",
"div\n",
"p\n",
"i\n",
"sint\n",
"occaecat cupidatat non proident\n",
"p\n",
"Lorem ipsum\n",
"div\n",
"p\n",
"Lorem ipsum dolor sit amet, ea quo modus meliore platonem.\n"
]
}
],
"source": [
"html_parser = HTMLParser(html)\n",
"for node in html_parser.root.traverse():\n",
"\n",
" if node.tag == '-text':\n",
" text = node.text(deep=True).strip()\n",
" if text:\n",
" print(text)\n",
" else:\n",
" print(node.tag)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -288,7 +449,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -299,7 +460,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 16,
"metadata": {},
"outputs": [
{
Expand All @@ -308,7 +469,7 @@
"b'<div>\\xcf\\xf0\\xe8\\xe2\\xe5\\xf2 \\xec\\xe8\\xf0!</div>'"
]
},
"execution_count": 11,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -319,7 +480,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 17,
"metadata": {},
"outputs": [
{
Expand All @@ -328,7 +489,7 @@
"'WINDOWS-1251'"
]
},
"execution_count": 12,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -346,7 +507,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 18,
"metadata": {},
"outputs": [
{
Expand All @@ -355,7 +516,7 @@
"'WINDOWS-1251'"
]
},
"execution_count": 13,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -367,7 +528,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 19,
"metadata": {},
"outputs": [
{
Expand All @@ -376,7 +537,7 @@
"'WINDOWS-1251'"
]
},
"execution_count": 14,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -403,7 +564,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.7.3"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 4685c91

Please sign in to comment.