author      Louis Vézina <[email protected]>   2019-09-13 15:12:26 -0400
committer   Louis Vézina <[email protected]>   2019-09-13 15:12:26 -0400
commit      645952c61aba3cccb5ca919be966a7ba02d853fa (patch)
tree        005362d11c260b4293b427ed8ba9d94e8b478ec3 /libs/bs4
parent      4e7e3a39d26cb3771876b43c82a747f6c6526dda (diff)
download    bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.tar.gz
            bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.zip
WIP
Diffstat (limited to 'libs/bs4')
-rw-r--r--  libs/bs4/AUTHORS.txt                 43
-rw-r--r--  libs/bs4/COPYING.txt                 27
-rw-r--r--  libs/bs4/NEWS.txt                  1190
-rw-r--r--  libs/bs4/README.txt                  63
-rw-r--r--  libs/bs4/TODO.txt                    31
-rw-r--r--  libs/bs4/__init__.py                267
-rw-r--r--  libs/bs4/builder/__init__.py         74
-rw-r--r--  libs/bs4/builder/_html5lib.py        40
-rw-r--r--  libs/bs4/builder/_htmlparser.py      72
-rw-r--r--  libs/bs4/builder/_lxml.py            90
-rw-r--r--  libs/bs4/dammit.py                   40
-rw-r--r--  libs/bs4/diagnose.py                 77
-rw-r--r--  libs/bs4/element.py                 859
-rw-r--r--  libs/bs4/testing.py                 270
-rw-r--r--  libs/bs4/tests/test_html5lib.py      50
-rw-r--r--  libs/bs4/tests/test_htmlparser.py    19
-rw-r--r--  libs/bs4/tests/test_lxml.py          32
-rw-r--r--  libs/bs4/tests/test_soup.py         134
-rw-r--r--  libs/bs4/tests/test_tree.py         301
19 files changed, 1412 insertions, 2267 deletions
diff --git a/libs/bs4/AUTHORS.txt b/libs/bs4/AUTHORS.txt
deleted file mode 100644
index 2ac8fcc8c..000000000
--- a/libs/bs4/AUTHORS.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-Behold, mortal, the origins of Beautiful Soup...
-================================================
-
-Leonard Richardson is the primary programmer.
-
-Aaron DeVore is awesome.
-
-Mark Pilgrim provided the encoding detection code that forms the base
-of UnicodeDammit.
-
-Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
-Soup 4 working under Python 3.
-
-Simon Willison wrote soupselect, which was used to make Beautiful Soup
-support CSS selectors.
-
-Sam Ruby helped with a lot of edge cases.
-
-Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
-work in solving the nestable tags conundrum.
-
-An incomplete list of people who have contributed patches to Beautiful
-Soup:
-
- Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
- Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
- Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
- Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
- Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
- Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
- Webster, Paul Wright, Danny Yoo
-
-An incomplete list of people who made suggestions or found bugs or
-found ways to break Beautiful Soup:
-
- Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
- Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
- Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
- warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
- Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
- Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
- Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
- Sousa Rocha, Yichun Wei, Per Vognsen
diff --git a/libs/bs4/COPYING.txt b/libs/bs4/COPYING.txt
deleted file mode 100644
index b91188869..000000000
--- a/libs/bs4/COPYING.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Beautiful Soup is made available under the MIT license:
-
- Copyright (c) 2004-2015 Leonard Richardson
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
-Beautiful Soup incorporates code from the html5lib library, which is
-also made available under the MIT license. Copyright (c) 2006-2013
-James Graham and other contributors
diff --git a/libs/bs4/NEWS.txt b/libs/bs4/NEWS.txt
deleted file mode 100644
index 3726c570a..000000000
--- a/libs/bs4/NEWS.txt
+++ /dev/null
@@ -1,1190 +0,0 @@
-= 4.4.1 (20150928) =
-
-* Fixed a bug that deranged the tree when part of it was
- removed. Thanks to Eric Weiser for the patch and John Wiseman for a
- test. [bug=1481520]
-
-* Fixed a parse bug with the html5lib tree-builder. Thanks to Roel
- Kramer for the patch. [bug=1483781]
-
-* Improved the implementation of CSS selector grouping. Thanks to
- Orangain for the patch. [bug=1484543]
-
-* Fixed the test_detect_utf8 test so that it works when chardet is
- installed. [bug=1471359]
-
-* Corrected the output of Declaration objects. [bug=1477847]
-
-
-= 4.4.0 (20150703) =
-
-Especially important changes:
-
-* Added a warning when you instantiate a BeautifulSoup object without
- explicitly naming a parser. [bug=1398866]
-
-* __repr__ now returns an ASCII bytestring in Python 2, and a Unicode
- string in Python 3, instead of a UTF8-encoded bytestring in both
- versions. In Python 3, __str__ now returns a Unicode string instead
- of a bytestring. [bug=1420131]
-
-* The `text` argument to the find_* methods is now called `string`,
- which is more accurate. `text` still works, but `string` is the
- argument described in the documentation. `text` may eventually
- change its meaning, but not for a very long time. [bug=1366856]
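-
-  For illustration only (this example is not from the original
-  changelog; it assumes the 4.4.0 API described above):
-
-    from bs4 import BeautifulSoup
-    soup = BeautifulSoup('<a href="/">Click here</a>', "html.parser")
-    soup.find("a", string="Click here")  # preferred spelling
-    soup.find("a", text="Click here")    # still accepted alias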
-
-* Changed the way soup objects work under copy.copy(). Copying a
- NavigableString or a Tag will give you a new NavigableString that's
- equal to the old one but not connected to the parse tree. Patch by
-  Martijn Pieters. [bug=1307490]
-
-* Started using a standard MIT license. [bug=1294662]
-
-* Added a Chinese translation of the documentation by Delong .w.
-
-New features:
-
-* Introduced the select_one() method, which uses a CSS selector but
- only returns the first match, instead of a list of
- matches. [bug=1349367]
-
-* You can now create a Tag object without specifying a
- TreeBuilder. Patch by Martijn Pieters. [bug=1307471]
-
-* You can now create a NavigableString or a subclass just by invoking
- the constructor. [bug=1294315]
-
-* Added an `exclude_encodings` argument to UnicodeDammit and to the
- Beautiful Soup constructor, which lets you prohibit the detection of
- an encoding that you know is wrong. [bug=1469408]
-
-* The select() method now supports selector grouping. Patch by
-  Francisco Canas. [bug=1191917]
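-
-  A hedged sketch of both new selector features (illustrative, not
-  from the original changelog), assuming an already-parsed `soup`:
-
-    soup.select_one("p.intro")   # first match only, or None
-    soup.select("h1, h2, h3")    # grouping: matches any of the three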
-
-Bug fixes:
-
-* Fixed yet another problem that caused the html5lib tree builder to
- create a disconnected parse tree. [bug=1237763]
-
-* Force object_was_parsed() to keep the tree intact even when an element
- from later in the document is moved into place. [bug=1430633]
-
-* Fixed yet another bug that caused a disconnected tree when html5lib
- copied an element from one part of the tree to another. [bug=1270611]
-
-* Fixed a bug where Element.extract() could create an infinite loop in
- the remaining tree.
-
-* The select() method can now find tags whose names contain
- dashes. Patch by Francisco Canas. [bug=1276211]
-
-* The select() method can now find tags with attributes whose names
- contain dashes. Patch by Marek Kapolka. [bug=1304007]
-
-* Improved the lxml tree builder's handling of processing
- instructions. [bug=1294645]
-
-* Restored the helpful syntax error that happens when you try to
- import the Python 2 edition of Beautiful Soup under Python
- 3. [bug=1213387]
-
-* In Python 3.4 and above, set the new convert_charrefs argument to
- the html.parser constructor to avoid a warning and future
- failures. Patch by Stefano Revera. [bug=1375721]
-
-* The warning when you pass in a filename or URL as markup will now be
- displayed correctly even if the filename or URL is a Unicode
- string. [bug=1268888]
-
-* If the initial <html> tag contains a CDATA list attribute such as
- 'class', the html5lib tree builder will now turn its value into a
- list, as it would with any other tag. [bug=1296481]
-
-* Fixed an import error in Python 3.5 caused by the removal of the
- HTMLParseError class. [bug=1420063]
-
-* Improved docstring for encode_contents() and
- decode_contents(). [bug=1441543]
-
-* Fixed a crash in Unicode, Dammit's encoding detector when the name
- of the encoding itself contained invalid bytes. [bug=1360913]
-
-* Improved the exception raised when you call .unwrap() or
- .replace_with() on an element that's not attached to a tree.
-
-* Raise a NotImplementedError whenever an unsupported CSS pseudoclass
- is used in select(). Previously some cases did not result in a
- NotImplementedError.
-
-* It's now possible to pickle a BeautifulSoup object no matter which
- tree builder was used to create it. However, the only tree builder
- that survives the pickling process is the HTMLParserTreeBuilder
- ('html.parser'). If you unpickle a BeautifulSoup object created with
- some other tree builder, soup.builder will be None. [bug=1231545]
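-
-  A minimal sketch (not from the original changelog), assuming the
-  default html.parser builder:
-
-    import pickle
-    from bs4 import BeautifulSoup
-    soup = BeautifulSoup("<p>Hello</p>", "html.parser")
-    restored = pickle.loads(pickle.dumps(soup))
-    # restored.builder survives here only because html.parser was
-    # used; with lxml or html5lib it would come back as None.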
-
-= 4.3.2 (20131002) =
-
-* Fixed a bug in which short Unicode input was improperly encoded to
- ASCII when checking whether or not it was the name of a file on
- disk. [bug=1227016]
-
-* Fixed a crash when a short input contains data not valid in
- filenames. [bug=1232604]
-
-* Fixed a bug that caused Unicode data put into UnicodeDammit to
- return None instead of the original data. [bug=1214983]
-
-* Combined two tests to stop a spurious test failure when tests are
- run by nosetests. [bug=1212445]
-
-= 4.3.1 (20130815) =
-
-* Fixed yet another problem with the html5lib tree builder, caused by
- html5lib's tendency to rearrange the tree during
- parsing. [bug=1189267]
-
-* Fixed a bug that caused the optimized version of find_all() to
- return nothing. [bug=1212655]
-
-= 4.3.0 (20130812) =
-
-* Instead of converting incoming data to Unicode and feeding it to the
- lxml tree builder in chunks, Beautiful Soup now makes successive
- guesses at the encoding of the incoming data, and tells lxml to
- parse the data as that encoding. Giving lxml more control over the
- parsing process improves performance and avoids a number of bugs and
- issues with the lxml parser which had previously required elaborate
- workarounds:
-
- - An issue in which lxml refuses to parse Unicode strings on some
- systems. [bug=1180527]
-
- - A returning bug that truncated documents longer than a (very
- small) size. [bug=963880]
-
- - A returning bug in which extra spaces were added to a document if
- the document defined a charset other than UTF-8. [bug=972466]
-
- This required a major overhaul of the tree builder architecture. If
- you wrote your own tree builder and didn't tell me, you'll need to
- modify your prepare_markup() method.
-
-* The UnicodeDammit code that makes guesses at encodings has been
- split into its own class, EncodingDetector. A lot of apparently
- redundant code has been removed from Unicode, Dammit, and some
- undocumented features have also been removed.
-
-* Beautiful Soup will issue a warning if instead of markup you pass it
- a URL or the name of a file on disk (a common beginner's mistake).
-
-* A number of optimizations improve the performance of the lxml tree
- builder by about 33%, the html.parser tree builder by about 20%, and
- the html5lib tree builder by about 15%.
-
-* All find_all calls should now return a ResultSet object. Patch by
- Aaron DeVore. [bug=1194034]
-
-= 4.2.1 (20130531) =
-
-* The default XML formatter will now replace ampersands even if they
- appear to be part of entities. That is, "&lt;" will become
- "&amp;lt;". The old code was left over from Beautiful Soup 3, which
- didn't always turn entities into Unicode characters.
-
- If you really want the old behavior (maybe because you add new
- strings to the tree, those strings include entities, and you want
- the formatter to leave them alone on output), it can be found in
- EntitySubstitution.substitute_xml_containing_entities(). [bug=1182183]
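-
-  A small sketch of the new default (illustrative, not from the
-  original changelog; requires lxml for the "xml" feature):
-
-    from bs4 import BeautifulSoup
-    soup = BeautifulSoup("<root></root>", "xml")
-    soup.root.append("&lt;")  # a literal five-character string
-    str(soup)  # the ampersand is escaped: <root>&amp;lt;</root>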
-
-* Gave new_string() the ability to create subclasses of
- NavigableString. [bug=1181986]
-
-* Fixed another bug by which the html5lib tree builder could create a
- disconnected tree. [bug=1182089]
-
-* The .previous_element of a BeautifulSoup object is now always None,
- not the last element to be parsed. [bug=1182089]
-
-* Fixed test failures when lxml is not installed. [bug=1181589]
-
-* html5lib now supports Python 3. Fixed some Python 2-specific
- code in the html5lib test suite. [bug=1181624]
-
-* The html.parser treebuilder can now handle numeric attributes in
-  text when the hexadecimal name of the attribute starts with a
- capital X. Patch by Tim Shirley. [bug=1186242]
-
-= 4.2.0 (20130514) =
-
-* The Tag.select() method now supports a much wider variety of CSS
- selectors.
-
- - Added support for the adjacent sibling combinator (+) and the
- general sibling combinator (~). Tests by "liquider". [bug=1082144]
-
- - The combinators (>, +, and ~) can now combine with any supported
- selector, not just one that selects based on tag name.
-
- - Added limited support for the "nth-of-type" pseudo-class. Code
- by Sven Slootweg. [bug=1109952]
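-
-  A hedged sketch of the new combinators (the selectors below are
-  illustrative, not from the changelog), assuming a parsed `soup`:
-
-    soup.select("div > p")           # direct child
-    soup.select("h1 + p")            # adjacent sibling
-    soup.select("h1 ~ p")            # general sibling
-    soup.select("p:nth-of-type(3)")  # third <p> among its siblings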
-
-* The BeautifulSoup class is now aliased to "_s" and "_soup", making
- it quicker to type the import statement in an interactive session:
-
- from bs4 import _s
- or
- from bs4 import _soup
-
- The alias may change in the future, so don't use this in code you're
- going to run more than once.
-
-* Added the 'diagnose' submodule, which includes several useful
- functions for reporting problems and doing tech support.
-
- - diagnose(data) tries the given markup on every installed parser,
- reporting exceptions and displaying successes. If a parser is not
- installed, diagnose() mentions this fact.
-
- - lxml_trace(data, html=True) runs the given markup through lxml's
- XML parser or HTML parser, and prints out the parser events as
- they happen. This helps you quickly determine whether a given
- problem occurs in lxml code or Beautiful Soup code.
-
- - htmlparser_trace(data) is the same thing, but for Python's
- built-in HTMLParser class.
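-
-  For example (an illustrative sketch, not from the changelog):
-
-    from bs4.diagnose import diagnose
-    diagnose("<p>Some<b>bad<i>HTML")  # tries every installed parser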
-
-* In an HTML document, the contents of a <script> or <style> tag will
- no longer undergo entity substitution by default. XML documents work
- the same way they did before. [bug=1085953]
-
-* Methods like get_text() and properties like .strings now only give
- you strings that are visible in the document--no comments or
- processing commands. [bug=1050164]
-
-* The prettify() method now leaves the contents of <pre> tags
- alone. [bug=1095654]
-
-* Fix a bug in the html5lib treebuilder which sometimes created
- disconnected trees. [bug=1039527]
-
-* Fix a bug in the lxml treebuilder which crashed when a tag included
- an attribute from the predefined "xml:" namespace. [bug=1065617]
-
-* Fix a bug by which keyword arguments to find_parent() were not
- being passed on. [bug=1126734]
-
-* Stop a crash when unwisely messing with a tag that's been
- decomposed. [bug=1097699]
-
-* Now that lxml's segfault on invalid doctype has been fixed, fixed a
- corresponding problem on the Beautiful Soup end that was previously
- invisible. [bug=984936]
-
-* Fixed an exception when an overspecified CSS selector didn't match
- anything. Code by Stefaan Lippens. [bug=1168167]
-
-= 4.1.3 (20120820) =
-
-* Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious
- test failure caused by the lousy HTMLParser in those
- versions. [bug=1038503]
-
-* Raise a more specific error (FeatureNotFound) when a requested
- parser or parser feature is not installed. Raise NotImplementedError
- instead of ValueError when the user calls insert_before() or
- insert_after() on the BeautifulSoup object itself. Patch by Aaron
- Devore. [bug=1038301]
-
-= 4.1.2 (20120817) =
-
-* As per PEP-8, allow searching by CSS class using the 'class_'
- keyword argument. [bug=1037624]
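-
-  A quick sketch (illustrative, not from the original changelog):
-
-    from bs4 import BeautifulSoup
-    soup = BeautifulSoup('<p class="header">Hi</p>', "html.parser")
-    soup.find_all("p", class_="header")  # note the trailing underscore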
-
-* Display namespace prefixes for namespaced attribute names, instead of
- the fully-qualified names given by the lxml parser. [bug=1037597]
-
-* Fixed a crash on encoding when an attribute name contained
- non-ASCII characters.
-
-* When sniffing encodings, if the cchardet library is installed,
- Beautiful Soup uses it instead of chardet. cchardet is much
- faster. [bug=1020748]
-
-* Use logging.warning() instead of warnings.warn() to notify the user
- that characters were replaced with REPLACEMENT
- CHARACTER. [bug=1013862]
-
-= 4.1.1 (20120703) =
-
-* Fixed an html5lib tree builder crash which happened when html5lib
- moved a tag with a multivalued attribute from one part of the tree
- to another. [bug=1019603]
-
-* Correctly display closing tags with an XML namespace declared. Patch
- by Andreas Kostyrka. [bug=1019635]
-
-* Fixed a typo that made parsing significantly slower than it should
- have been, and also waited too long to close tags with XML
- namespaces. [bug=1020268]
-
-* get_text() now returns an empty Unicode string if there is no text,
- rather than an empty bytestring. [bug=1020387]
-
-= 4.1.0 (20120529) =
-
-* Added experimental support for fixing Windows-1252 characters
- embedded in UTF-8 documents. (UnicodeDammit.detwingle())
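-
-  A minimal sketch (the byte values are illustrative assumptions, not
-  from the changelog):
-
-    from bs4 import UnicodeDammit
-    data = "\N{SNOWMAN}".encode("utf8") + b"\x93fixed\x94"
-    UnicodeDammit.detwingle(data).decode("utf8")
-    # the Windows-1252 smart quotes are re-encoded as UTF-8 first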
-
-* Fixed the handling of &quot; with the built-in parser. [bug=993871]
-
-* Comments, processing instructions, document type declarations, and
- markup declarations are now treated as preformatted strings, the way
- CData blocks are. [bug=1001025]
-
-* Fixed a bug with the lxml treebuilder that prevented the user from
- adding attributes to a tag that didn't originally have
- attributes. [bug=1002378] Thanks to Oliver Beattie for the patch.
-
-* Fixed some edge-case bugs having to do with inserting an element
- into a tag it's already inside, and replacing one of a tag's
- children with another. [bug=997529]
-
-* Added the ability to search for attribute values specified in UTF-8. [bug=1003974]
-
- This caused a major refactoring of the search code. All the tests
- pass, but it's possible that some searches will behave differently.
-
-= 4.0.5 (20120427) =
-
-* Added a new method, wrap(), which wraps an element in a tag.
-
-* Renamed replace_with_children() to unwrap(), which is easier to
- understand and also the jQuery name of the function.
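-
-  A short sketch of the pair (illustrative, not from the changelog):
-
-    from bs4 import BeautifulSoup
-    soup = BeautifulSoup("<p>I wish I was bold.</p>", "html.parser")
-    soup.p.string.wrap(soup.new_tag("b"))  # <p><b>I wish I was bold.</b></p>
-    soup.p.b.unwrap()                      # back to the original <p>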
-
-* Made encoding substitution in <meta> tags completely transparent (no
- more %SOUP-ENCODING%).
-
-* Fixed a bug in decoding data that contained a byte-order mark, such
- as data encoded in UTF-16LE. [bug=988980]
-
-* Fixed a bug that made the HTMLParser treebuilder generate XML
- definitions ending with two question marks instead of
- one. [bug=984258]
-
-* Upon document generation, CData objects are no longer run through
- the formatter. [bug=988905]
-
-* The test suite now passes when lxml is not installed, whether or not
- html5lib is installed. [bug=987004]
-
-* Print a warning on HTMLParseErrors to let people know they should
- install a better parser library.
-
-= 4.0.4 (20120416) =
-
-* Fixed a bug that sometimes created disconnected trees.
-
-* Fixed a bug with the string setter that moved a string around the
- tree instead of copying it. [bug=983050]
-
-* Attribute values are now run through the provided output formatter.
- Previously they were always run through the 'minimal' formatter. In
- the future I may make it possible to specify different formatters
- for attribute values and strings, but for now, consistent behavior
- is better than inconsistent behavior. [bug=980237]
-
-* Added the missing renderContents method from Beautiful Soup 3. Also
- added an encode_contents() method to go along with decode_contents().
-
-* Give a more useful error when the user tries to run the Python 2
- version of BS under Python 3.
-
-* UnicodeDammit can now convert Microsoft smart quotes to ASCII with
- UnicodeDammit(markup, smart_quotes_to="ascii").
-
-= 4.0.3 (20120403) =
-
-* Fixed a typo that caused some versions of Python 3 to convert the
- Beautiful Soup codebase incorrectly.
-
-* Got rid of the 4.0.2 workaround for HTML documents--it was
- unnecessary and the workaround was triggering a (possibly different,
- but related) bug in lxml. [bug=972466]
-
-= 4.0.2 (20120326) =
-
-* Worked around a possible bug in lxml that prevents non-tiny XML
- documents from being parsed. [bug=963880, bug=963936]
-
-* Fixed a bug where specifying `text` while also searching for a tag
- only worked if `text` wanted an exact string match. [bug=955942]
-
-= 4.0.1 (20120314) =
-
-* This is the first official release of Beautiful Soup 4. There is no
- 4.0.0 release, to eliminate any possibility that packaging software
- might treat "4.0.0" as being an earlier version than "4.0.0b10".
-
-* Brought BS up to date with the latest release of soupselect, adding
- CSS selector support for direct descendant matches and multiple CSS
- class matches.
-
-= 4.0.0b10 (20120302) =
-
-* Added support for simple CSS selectors, taken from the soupselect project.
-
-* Fixed a crash when using html5lib. [bug=943246]
-
-* In HTML5-style <meta charset="foo"> tags, the value of the "charset"
- attribute is now replaced with the appropriate encoding on
- output. [bug=942714]
-
-* Fixed a bug that caused calling a tag to sometimes call find_all()
- with the wrong arguments. [bug=944426]
-
-* For backwards compatibility, brought back the BeautifulStoneSoup
- class as a deprecated wrapper around BeautifulSoup.
-
-= 4.0.0b9 (20120228) =
-
-* Fixed the string representation of DOCTYPEs that have both a public
- ID and a system ID.
-
-* Fixed the generated XML declaration.
-
-* Renamed Tag.nsprefix to Tag.prefix, for consistency with
- NamespacedAttribute.
-
-* Fixed a test failure that occurred on Python 3.x when chardet was
- installed.
-
-* Made prettify() return Unicode by default, so it will look nice on
- Python 3 when passed into print().
-
-= 4.0.0b8 (20120224) =
-
-* All tree builders now preserve namespace information in the
- documents they parse. If you use the html5lib parser or lxml's XML
- parser, you can access the namespace URL for a tag as tag.namespace.
-
- However, there is no special support for namespace-oriented
- searching or tree manipulation. When you search the tree, you need
- to use namespace prefixes exactly as they're used in the original
- document.
-
-* The string representation of a DOCTYPE always ends in a newline.
-
-* Issue a warning if the user tries to use a SoupStrainer in
- conjunction with the html5lib tree builder, which doesn't support
- them.
-
-= 4.0.0b7 (20120223) =
-
-* Upon decoding to string, any characters that can't be represented in
- your chosen encoding will be converted into numeric XML entity
- references.
-
-* Issue a warning if characters were replaced with REPLACEMENT
- CHARACTER during Unicode conversion.
-
-* Restored compatibility with Python 2.6.
-
-* The install process no longer installs docs or auxiliary text files.
-
-* It's now possible to deepcopy a BeautifulSoup object created with
- Python's built-in HTML parser.
-
-* About 100 unit tests that "test" the behavior of various parsers on
- invalid markup have been removed. Legitimate changes to those
- parsers caused these tests to fail, indicating that perhaps
- Beautiful Soup should not test the behavior of foreign
- libraries.
-
- The problematic unit tests have been reformulated as informational
- comparisons generated by the script
- scripts/demonstrate_parser_differences.py.
-
- This makes Beautiful Soup compatible with html5lib version 0.95 and
- future versions of HTMLParser.
-
-= 4.0.0b6 (20120216) =
-
-* Multi-valued attributes like "class" always have a list of values,
- even if there's only one value in the list.
-
-* Added a number of multi-valued attributes defined in HTML5.
-
-* Stopped generating a space before the slash that closes an
- empty-element tag. This may come back if I add a special XHTML mode
- (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty
- useless.
-
-* Passing text along with tag-specific arguments to a find* method:
-
- find("a", text="Click here")
-
- will find tags that contain the given text as their
- .string. Previously, the tag-specific arguments were ignored and
- only strings were searched.
-
-* Fixed a bug that caused the html5lib tree builder to build a
- partially disconnected tree. Generally cleaned up the html5lib tree
- builder.
-
-* If you restrict a multi-valued attribute like "class" to a string
- that contains spaces, Beautiful Soup will only consider it a match
- if the values correspond to that specific string.
-
-= 4.0.0b5 (20120209) =
-
-* Rationalized Beautiful Soup's treatment of CSS class. A tag
- belonging to multiple CSS classes is treated as having a list of
- values for the 'class' attribute. Searching for a CSS class will
- match *any* of the CSS classes.
-
- This actually affects all attributes that the HTML standard defines
- as taking multiple values (class, rel, rev, archive, accept-charset,
- and headers), but 'class' is by far the most common. [bug=41034]
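-
-  An illustrative sketch (written with the class_ spelling that
-  arrived later, in 4.1.2):
-
-    from bs4 import BeautifulSoup
-    soup = BeautifulSoup('<p class="body strikeout"></p>', "html.parser")
-    soup.p["class"]                    # ["body", "strikeout"]
-    soup.find_all("p", class_="body")  # matches: any one class will do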
-
-* If you pass anything other than a dictionary as the second argument
- to one of the find* methods, it'll assume you want to use that
- object to search against a tag's CSS classes. Previously this only
- worked if you passed in a string.
-
-* Fixed a bug that caused a crash when you passed a dictionary as an
- attribute value (possibly because you mistyped "attrs"). [bug=842419]
-
-* Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags
- like <meta charset="utf-8" />. [bug=837268]
-
-* If Unicode, Dammit can't figure out a consistent encoding for a
- page, it will try each of its guesses again, with errors="replace"
- instead of errors="strict". This may mean that some data gets
- replaced with REPLACEMENT CHARACTER, but at least most of it will
- get turned into Unicode. [bug=754903]
-
-* Patched over a bug in html5lib (?) that was crashing Beautiful Soup
- on certain kinds of markup. [bug=838800]
-
-* Fixed a bug that wrecked the tree if you replaced an element with an
- empty string. [bug=728697]
-
-* Improved Unicode, Dammit's behavior when you give it Unicode to
- begin with.
-
-= 4.0.0b4 (20120208) =
-
-* Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag()
-
-* BeautifulSoup.new_tag() will follow the rules of whatever
- tree-builder was used to create the original BeautifulSoup object. A
- new <p> tag will look like "<p />" if the soup object was created to
- parse XML, but it will look like "<p></p>" if the soup object was
- created to parse HTML.
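-
-  For instance (an illustrative sketch, not from the changelog):
-
-    from bs4 import BeautifulSoup
-    soup = BeautifulSoup("<p></p>", "html.parser")
-    link = soup.new_tag("a", href="http://example.com/")
-    link.string = soup.new_string("A link")
-    soup.p.append(link)  # <p><a href="http://example.com/">A link</a></p>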
-
-* We pass in strict=False to html.parser on Python 3, greatly
- improving html.parser's ability to handle bad HTML.
-
-* We also monkeypatch a serious bug in html.parser that made
- strict=False disastrous on Python 3.2.2.
-
-* Replaced the "substitute_html_entities" argument with the
- more general "formatter" argument.
-
-* Bare ampersands and angle brackets are always converted to XML
- entities unless the user prevents it.
-
-* Added PageElement.insert_before() and PageElement.insert_after(),
- which let you put an element into the parse tree with respect to
- some other element.
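-
-  A brief sketch (illustrative, not from the changelog):
-
-    from bs4 import BeautifulSoup
-    soup = BeautifulSoup("<b>stop</b>", "html.parser")
-    tag = soup.new_tag("i")
-    tag.string = "Don't "
-    soup.b.insert_before(tag)  # <i>Don't </i><b>stop</b>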
-
-* Raise an exception when the user tries to do something nonsensical
- like insert a tag into itself.
-
-
-= 4.0.0b3 (20120203) =
-
-Beautiful Soup 4 is a nearly-complete rewrite that removes Beautiful
-Soup's custom HTML parser in favor of a system that lets you write a
-little glue code and plug in any HTML or XML parser you want.
-
-Beautiful Soup 4.0 comes with glue code for four parsers:
-
- * Python's standard HTMLParser (html.parser in Python 3)
- * lxml's HTML and XML parsers
- * html5lib's HTML parser
-
-HTMLParser is the default, but I recommend you install lxml if you
-can.
-
-For complete documentation, see the Sphinx documentation in
-bs4/doc/source/. What follows is a summary of the changes from
-Beautiful Soup 3.
-
-=== The module name has changed ===
-
-Previously you imported the BeautifulSoup class from a module also
-called BeautifulSoup. To save keystrokes and make it clear which
-version of the API is in use, the module is now called 'bs4':
-
- >>> from bs4 import BeautifulSoup
-
-=== It works with Python 3 ===
-
-Beautiful Soup 3.1.0 worked with Python 3, but the parser it used was
-so bad that it barely worked at all. Beautiful Soup 4 works with
-Python 3, and since its parser is pluggable, you don't sacrifice
-quality.
-
-Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3
-support to the finish line. Ezio Melotti is also to thank for greatly
-improving the HTML parser that comes with Python 3.2.
-
-=== CDATA sections are normal text, if they're understood at all. ===
-
-Currently, the lxml and html5lib HTML parsers ignore CDATA sections in
-markup:
-
- <p><![CDATA[foo]]></p> => <p></p>
-
-A future version of html5lib will turn CDATA sections into text nodes,
-but only within tags like <svg> and <math>:
-
- <svg><![CDATA[foo]]></svg> => <svg>foo</svg>
-
-The default XML parser (which uses lxml behind the scenes) turns CDATA
-sections into ordinary text elements:
-
- <p><![CDATA[foo]]></p> => <p>foo</p>
-
-In theory it's possible to preserve the CDATA sections when using the
-XML parser, but I don't see how to get it to work in practice.
-
-=== Miscellaneous other stuff ===
-
-If the BeautifulSoup instance has .is_xml set to True, an appropriate
-XML declaration will be emitted when the tree is transformed into a
-string:
-
- <?xml version="1.0" encoding="utf-8"?>
- <markup>
- ...
- </markup>
-
-The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree
-builders set it to False. If you want to parse XHTML with an HTML
-parser, you can set it manually.
-
-
-= 3.2.0 =
-
-The 3.1 series wasn't very useful, so I renamed the 3.0 series to 3.2
-to make it obvious which one you should use.
-
-= 3.1.0 =
-
-A hybrid version that supports 2.4 and can be automatically converted
-to run under Python 3.0. There are three backwards-incompatible
-changes you should be aware of, but no new features or deliberate
-behavior changes.
-
-1. str() may no longer do what you want. This is because the meaning
-of str() inverts between Python 2 and 3; in Python 2 it gives you a
-byte string, in Python 3 it gives you a Unicode string.
-
-The effect of this is that you can't pass an encoding to .__str__
-anymore. Use encode() to get a string and decode() to get Unicode, and
-you'll be ready (well, readier) for Python 3.
-
-2. Beautiful Soup is now based on HTMLParser rather than SGMLParser,
-which is gone in Python 3. There's some bad HTML that SGMLParser
-handled but HTMLParser doesn't, usually to do with attribute values
-that aren't closed or have brackets inside them:
-
- <a href="foo</a>, </a><a href="bar">baz</a>
- <a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a>
-
-A later version of Beautiful Soup will allow you to plug in different
-parsers to make tradeoffs between speed and the ability to handle bad
-HTML.
-
-3. In Python 3 (but not Python 2), HTMLParser converts entities within
-attributes to the corresponding Unicode characters. In Python 2 it's
-possible to parse this string and leave the &eacute; intact.
-
- <a href="http://crummy.com?sacr&eacute;&bleu">
-
-In Python 3, the &eacute; is always converted to \xe9 during
-parsing.
-
-
-= 3.0.7a =
-
-Added an import that makes BS work in Python 2.3.
-
-
-= 3.0.7 =
-
-Fixed a UnicodeDecodeError when unpickling documents that contain
-non-ASCII characters.
-
-Fixed a TypeError that occurred in some circumstances when a tag
-contained no text.
-
-Jump through hoops to avoid the use of chardet, which can be extremely
-slow in some circumstances. UTF-8 documents should never trigger the
-use of chardet.
-
-Whitespace is preserved inside <pre> and <textarea> tags that contain
-nothing but whitespace.
-
-Beautiful Soup can now parse a doctype that's scoped to an XML namespace.
-
-
-= 3.0.6 =
-
-Got rid of a very old debug line that prevented chardet from working.
-
-Added a Tag.decompose() method that completely disconnects a tree or a
-subset of a tree, breaking it up into bite-sized pieces that are
-easy for the garbage collector to collect.
-
-Tag.extract() now returns the tag that was extracted.
-
-Tag.findNext() now does something with the keyword arguments you pass
-it instead of dropping them on the floor.
-
-Fixed a Unicode conversion bug.
-
-Fixed a bug that garbled some <meta> tags when rewriting them.
-
-
-= 3.0.5 =
-
-Soup objects can now be pickled, and copied with copy.deepcopy.
-
-Tag.append now works properly on existing BS objects. (It wasn't
-originally intended for outside use, but it can be now.) (Giles
-Radford)
-
-Passing in a nonexistent encoding will no longer crash the parser on
-Python 2.4 (John Nagle).
-
-Fixed an underlying bug in SGMLParser that thinks ASCII has 255
-characters instead of 127 (John Nagle).
-
-Entities are converted more consistently to Unicode characters.
-
-Entity references in attribute values are now converted to Unicode
-characters when appropriate. Numeric entities are always converted,
-because SGMLParser always converts them outside of attribute values.
-
-ALL_ENTITIES happens to just be the XHTML entities, so I renamed it to
-XHTML_ENTITIES.
-
-The regular expression for bare ampersands was too loose. In some
-cases ampersands were not being escaped. (Sam Ruby?)
-
-Non-breaking spaces and other special Unicode space characters are no
-longer folded to ASCII spaces. (Robert Leftwich)
-
-Information inside a TEXTAREA tag is now parsed literally, not as HTML
-tags. TEXTAREA now works exactly the same way as SCRIPT. (Zephyr Fang)
-
-= 3.0.4 =
-
-Fixed a bug that crashed Unicode conversion in some cases.
-
-Fixed a bug that prevented UnicodeDammit from being used as a
-general-purpose data scrubber.
-
-Fixed some unit test failures when running against Python 2.5.
-
-When considering whether to convert smart quotes, UnicodeDammit now
-looks at the original encoding in a case-insensitive way.
-
-= 3.0.3 (20060606) =
-
-Beautiful Soup is now usable as a way to clean up invalid XML/HTML (be
-sure to pass in an appropriate value for convertEntities, or XML/HTML
-entities might stick around that aren't valid in HTML/XML). The result
-may not validate, but it should be good enough to not choke a
-real-world XML parser. Specifically, the output of a properly
-constructed soup object should always be valid as part of an XML
-document, but parts may be missing if they were missing in the
-original. As always, if the input is valid XML, the output will also
-be valid.
-
-= 3.0.2 (20060602) =
-
-Previously, Beautiful Soup correctly handled attribute values that
-contained embedded quotes (sometimes by escaping), but not other kinds
-of XML character. Now, it correctly handles or escapes all special XML
-characters in attribute values.
-
-I aliased methods to the 2.x names (fetch, find, findText, etc.) for
-backwards compatibility purposes. Those names are deprecated and if I
-ever do a 4.0 I will remove them. I will, I tell you!
-
-Fixed a bug where the findAll method wasn't passing along any keyword
-arguments.
-
-When run from the command line, Beautiful Soup now acts as an HTML
-pretty-printer, not an XML pretty-printer.
-
-= 3.0.1 (20060530) =
-
-Reintroduced the "fetch by CSS class" shortcut. I thought keyword
-arguments would replace it, but they don't. You can't call soup('a',
-class='foo') because class is a Python keyword.
-
-If Beautiful Soup encounters a meta tag that declares the encoding,
-but a SoupStrainer tells it not to parse that tag, Beautiful Soup will
-no longer try to rewrite the meta tag to mention the new
-encoding. Basically, this makes SoupStrainers work in real-world
-applications instead of crashing the parser.
-
-= 3.0.0 "Who would not give all else for two p" (20060528) =
-
-This release is not backward-compatible with previous releases. If
-you've got code written with a previous version of the library, go
-ahead and keep using it, unless one of the features mentioned here
-really makes your life easier. Since the library is self-contained,
-you can include an old copy of the library in your old applications,
-and use the new version for everything else.
-
-The documentation has been rewritten and greatly expanded with many
-more examples.
-
-Beautiful Soup autodetects the encoding of a document (or uses the one
-you specify), and converts it from its native encoding to
-Unicode. Internally, it only deals with Unicode strings. When you
-print out the document, it converts to UTF-8 (or another encoding you
-specify). [Doc reference]
-
-It's now easy to make large-scale changes to the parse tree without
-screwing up the navigation members. The methods are extract,
-replaceWith, and insert. [Doc reference. See also Improving Memory
-Usage with extract]
-
-Passing True in as an attribute value gives you tags that have any
-value for that attribute. You don't have to create a regular
-expression. Passing None for an attribute value gives you tags that
-don't have that attribute at all.
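-
-A hedged sketch using the modern find_all() spelling (this era's
-method was findAll):
-
-    soup.find_all("a", href=True)  # every <a> that has an href at all
-    soup.find_all("a", href=None)  # every <a> with no href attribute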
-
-Tag objects now know whether or not they're self-closing. This avoids
-the problem where Beautiful Soup thought that tags like <BR /> were
-self-closing even in XML documents. You can customize the self-closing
-tags for a parser object by passing them in as a list of
-selfClosingTags: you don't have to subclass anymore.
-
-There's a new built-in parser, MinimalSoup, which has most of
-BeautifulSoup's HTML-specific rules, but no tag nesting rules. [Doc
-reference]
-
-You can use a SoupStrainer to tell Beautiful Soup to parse only part
-of a document. This saves time and memory, often making Beautiful Soup
-about as fast as a custom-built SGMLParser subclass. [Doc reference,
-SoupStrainer reference]
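-
-An illustrative sketch with today's names (in this era the argument
-was parseOnlyThese rather than parse_only):
-
-    from bs4 import BeautifulSoup, SoupStrainer
-    markup = '<a href="/">keep</a><p>skipped</p>'
-    soup = BeautifulSoup(markup, "html.parser",
-                         parse_only=SoupStrainer("a"))
-    str(soup)  # '<a href="/">keep</a>'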
-
-You can (usually) use keyword arguments instead of passing a
-dictionary of attributes to a search method. That is, you can replace
-soup(args={"id" : "5"}) with soup(id="5"). You can still use args if
-(for instance) you need to find an attribute whose name clashes with
-the name of an argument to findAll. [Doc reference: **kwargs attrs]
-
-The method names have changed to the better method names used in
-Rubyful Soup. Instead of find methods and fetch methods, there are
-only find methods. Instead of a scheme where you can't remember which
-method finds one element and which one finds them all, we have find
-and findAll. In general, if the method name mentions All or a plural
-noun (eg. findNextSiblings), then it finds many elements.
-Otherwise, it only finds one element. [Doc reference]
-
-Some of the argument names have been renamed for clarity. For instance
-avoidParserProblems is now parserMassage.
-
-Beautiful Soup no longer implements a feed method. You need to pass a
-string or a filehandle into the soup constructor, not with feed after
-the soup has been created. There is still a feed method, but it's the
-feed method implemented by SGMLParser and calling it will bypass
-Beautiful Soup and cause problems.
-
-The NavigableText class has been renamed to NavigableString. There is
-no NavigableUnicodeString anymore, because every string inside a
-Beautiful Soup parse tree is a Unicode string.
-
-findText and fetchText are gone. Just pass a text argument into find
-or findAll.
-
-Null was more trouble than it was worth, so I got rid of it. Anything
-that used to return Null now returns None.
-
-Special XML constructs like comments and CDATA now have their own
-NavigableString subclasses, instead of being treated as oddly-formed
-data. If you parse a document that contains CDATA and write it back
-out, the CDATA will still be there.
-
-When you're parsing a document, you can get Beautiful Soup to convert
-XML or HTML entities into the corresponding Unicode characters. [Doc
-reference]
-
-= 2.1.1 (20050918) =
-
-Fixed a serious performance bug in BeautifulStoneSoup which was
-causing parsing to be incredibly slow.
-
-Corrected several entities that were previously being incorrectly
-translated from Microsoft smart-quote-like characters.
-
-Fixed a bug that was breaking text fetch.
-
-Fixed a bug that crashed the parser when text chunks that look like
-HTML tag names showed up within a SCRIPT tag.
-
-THEAD, TBODY, and TFOOT tags are now nestable within TABLE
-tags. Nested tables should parse more sensibly now.
-
-BASE is now considered a self-closing tag.
-
-= 2.1.0 "Game, or any other dish?" (20050504) =
-
-Added a wide variety of new search methods which, given a starting
-point inside the tree, follow a particular navigation member (like
-nextSibling) over and over again, looking for Tag and NavigableText
-objects that match certain criteria. The new methods are findNext,
-fetchNext, findPrevious, fetchPrevious, findNextSibling,
-fetchNextSiblings, findPreviousSibling, fetchPreviousSiblings,
-findParent, and fetchParents. All of these use the same basic code
-used by first and fetch, so you can pass your weird ways of matching
-things into these methods.
-
-The fetch method and its derivatives now accept a limit argument.
-
-You can now pass keyword arguments when calling a Tag object as though
-it were a method.
-
-Fixed a bug that caused all hand-created tags to share a single set of
-attributes.
-
-= 2.0.3 (20050501) =
-
-Fixed Python 2.2 support for iterators.
-
-Fixed a bug that gave the wrong representation to tags within quote
-tags like <script>.
-
-Took some code from Mark Pilgrim that treats CDATA declarations as
-data instead of ignoring them.
-
-Beautiful Soup's setup.py will now do an install even if the unit
-tests fail. It won't build a source distribution if the unit tests
-fail, so I can't release a new version unless they pass.
-
-= 2.0.2 (20050416) =
-
-Added the unit tests in a separate module, and packaged it with
-distutils.
-
-Fixed a bug that sometimes caused renderContents() to return a Unicode
-string even if there was no Unicode in the original string.
-
-Added the done() method, which closes all of the parser's open
-tags. It gets called automatically when you pass in some text to the
-constructor of a parser class; otherwise you must call it yourself.
-
-Reinstated some backwards compatibility with 1.x versions: referencing
-the string member of a NavigableText object returns the NavigableText
-object instead of throwing an error.
-
-= 2.0.1 (20050412) =
-
-Fixed a bug that caused bad results when you tried to reference a tag
-name shorter than 3 characters as a member of a Tag, eg. tag.table.td.
-
-Made sure all Tags have the 'hidden' attribute so that an attempt to
-access tag.hidden doesn't spawn an attempt to find a tag named
-'hidden'.
-
-Fixed a bug in the comparison operator.
-
-= 2.0.0 "Who cares for fish?" (20050410)
-
-Beautiful Soup version 1 was very useful but also pretty stupid. I
-originally wrote it without noticing any of the problems inherent in
-trying to build a parse tree out of ambiguous HTML tags. This version
-solves all of those problems to my satisfaction. It also adds many new
-clever things to make up for the removal of the stupid things.
-
-== Parsing ==
-
-The parser logic has been greatly improved, and the BeautifulSoup
-class should much more reliably yield a parse tree that looks like
-what the page author intended. For a particular class of odd edge
-cases that now causes problems, there is a new class,
-ICantBelieveItsBeautifulSoup.
-
-By default, Beautiful Soup now performs some cleanup operations on
-text before parsing it. This is to avoid common problems with bad
-definitions and self-closing tags that crash SGMLParser. You can
-provide your own set of cleanup operations, or turn it off
-altogether. The cleanup operations include fixing self-closing tags
-that don't close, and replacing Microsoft smart quotes and similar
-characters with their HTML entity equivalents.
-
-You can now get a pretty-print version of parsed HTML to get a visual
-picture of how Beautiful Soup parses it, with the Tag.prettify()
-method.
-
-== Strings and Unicode ==
-
-There are separate NavigableText subclasses for ASCII and Unicode
-strings. These classes directly subclass the corresponding base data
-types. This means you can treat NavigableText objects as strings
-instead of having to call methods on them to get the strings.
-
-str() on a Tag always returns a string, and unicode() always returns
-Unicode. Previously it was inconsistent.
-
-== Tree traversal ==
-
-In a first() or fetch() call, the tag name or the desired value of an
-attribute can now be any of the following:
-
- * A string (matches that specific tag or that specific attribute value)
- * A list of strings (matches any tag or attribute value in the list)
- * A compiled regular expression object (matches any tag or attribute
- value that matches the regular expression)
- * A callable object that takes the Tag object or attribute value as a
- string. It returns None/false/empty string if the given string
- doesn't match, and any other value if it does.
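-
-The same matcher types survive in the modern API; a hedged sketch
-using today's find_all() (this era's names were first and fetch),
-assuming a parsed `soup`:
-
-    import re
-    soup.find_all("p")                     # a specific tag name
-    soup.find_all(["h1", "h2"])            # any name in the list
-    soup.find_all(re.compile("^h[1-6]$"))  # compiled regular expression
-    soup.find_all(lambda t: len(t.attrs) == 2)  # callable on the Tag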
-
-This is much easier to use than SQL-style wildcards (see, regular
-expressions are good for something). Because of this, I took out
-SQL-style wildcards. I'll put them back if someone complains, but
-their removal simplifies the code a lot.
-
-You can use fetch() and first() to search for text in the parse tree,
-not just tags. There are new alias methods fetchText() and firstText()
-designed for this purpose. As with searching for tags, you can pass in
-a string, a regular expression object, or a method to match your text.
-
-If you pass in something besides a map to the attrs argument of
-fetch() or first(), Beautiful Soup will assume you want to match that
-thing against the "class" attribute. When you're scraping
-well-structured HTML, this makes your code a lot cleaner.
-
-1.x and 2.x both let you call a Tag object as a shorthand for
-fetch(). For instance, foo("bar") is a shorthand for
-foo.fetch("bar"). In 2.x, you can also access a specially-named member
-of a Tag object as a shorthand for first(). For instance, foo.barTag
-is a shorthand for foo.first("bar"). By chaining these shortcuts you
-traverse a tree in very little code: for header in
-soup.bodyTag.pTag.tableTag('th'):
-
-If an element relationship (like parent or next) doesn't apply to a
-tag, it'll now show up Null instead of None. first() will also return
-Null if you ask it for a nonexistent tag. Null is an object that's
-just like None, except you can do whatever you want to it and it'll
-give you Null instead of throwing an error.
-
-This lets you do tree traversals like soup.htmlTag.headTag.titleTag
-without having to worry if the intermediate stages are actually
-there. Previously, if there was no 'head' tag in the document, headTag
-in that instance would have been None, and accessing its 'titleTag'
-member would have thrown an AttributeError. Now, you can get what you
-want when it exists, and get Null when it doesn't, without having to
-do a lot of conditionals checking to see if every stage is None.
-
-There are two new relations between page elements: previousSibling and
-nextSibling. They reference the previous and next element at the same
-level of the parse tree. For instance, if you have HTML like this:
-
- <p><ul><li>Foo<br /><li>Bar</ul>
-
-The first 'li' tag has a previousSibling of Null and its nextSibling
-is the second 'li' tag. The second 'li' tag has a nextSibling of Null
-and its previousSibling is the first 'li' tag. The previousSibling of
-the 'ul' tag is the first 'p' tag. The nextSibling of 'Foo' is the
-'br' tag.
-
-I took out the ability to use fetch() to find tags that have a
-specific list of contents. See, I can't even explain it well. It was
-really difficult to use, I never used it, and I don't think anyone
-else ever used it. To the extent anyone did, they can probably use
-fetchText() instead. If it turns out someone needs it I'll think of
-another solution.
-
-== Tree manipulation ==
-
-You can add new attributes to a tag, and delete attributes from a
-tag. In 1.x you could only change a tag's existing attributes.
-
-== Porting Considerations ==
-
-There are three changes in 2.0 that break old code:
-
-In the post-1.2 release you could pass in a function into fetch(). The
-function took a string, the tag name. In 2.0, the function takes the
-actual Tag object.
-
-It's no longer possible to pass in SQL-style wildcards to fetch(). Use a
-regular expression instead.
-
-The different parsing algorithm means the parse tree may not be shaped
-like you expect. This will only actually affect you if your code uses
-one of the affected parts. I haven't run into this problem yet while
-porting my code.
-
-= Between 1.2 and 2.0 =
-
-This is the release to get if you want Python 1.5 compatibility.
-
-The desired value of an attribute can now be any of the following:
-
- * A string
- * A string with SQL-style wildcards
- * A compiled RE object
- * A callable that returns None/false/empty string if the given value
- doesn't match, and any other value otherwise.
-
-This is much easier to use than SQL-style wildcards (see, regular
-expressions are good for something). Because of this, I no longer
-recommend you use SQL-style wildcards. They may go away in a future
-release to clean up the code.
-
-Made Beautiful Soup handle processing instructions as text instead of
-ignoring them.
-
-Applied patch from Richie Hindle (richie at entrian dot com) that
-makes tag.string a shorthand for tag.contents[0].string when the tag
-has only one string-owning child.
-
-Added still more nestable tags. The nestable tags thing won't work in
-a lot of cases and needs to be rethought.
-
-Fixed an edge case where searching for "%foo" would match any string
-shorter than "foo".
-
-= 1.2 "Who for such dainties would not stoop?" (20040708) =
-
-Applied patch from Ben Last (ben at benlast dot com) that made
-Tag.renderContents() correctly handle Unicode.
-
-Made BeautifulStoneSoup even dumber by making it not implicitly close
-a tag when another tag of the same type is encountered; only when an
-actual closing tag is encountered. This change courtesy of Fuzzy (mike
-at pcblokes dot com). BeautifulSoup still works as before.
-
-= 1.1 "Swimming in a hot tureen" =
-
-Added more 'nestable' tags. Changed popping semantics so that when a
-nestable tag is encountered, tags are popped up to the previously
-encountered nestable tag (of whatever kind). I will revert this if
-enough people complain, but it should make more people's lives easier
-than harder. This enhancement was suggested by Anthony Baxter (anthony
-at interlink dot com dot au).
-
-= 1.0 "So rich and green" (20040420) =
-
-Initial release.
diff --git a/libs/bs4/README.txt b/libs/bs4/README.txt
deleted file mode 100644
index 305c51e05..000000000
--- a/libs/bs4/README.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-= Introduction =
-
- >>> from bs4 import BeautifulSoup
- >>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
- >>> print soup.prettify()
- <html>
- <body>
- <p>
- Some
- <b>
- bad
- <i>
- HTML
- </i>
- </b>
- </p>
- </body>
- </html>
- >>> soup.find(text="bad")
- u'bad'
-
- >>> soup.i
- <i>HTML</i>
-
- >>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
- >>> print soup.prettify()
- <?xml version="1.0" encoding="utf-8"?>
- <tag1>
- Some
- <tag2 />
- bad
- <tag3>
- XML
- </tag3>
- </tag1>
-
-= Full documentation =
-
-The bs4/doc/ directory contains full documentation in Sphinx
-format. Run "make html" in that directory to create HTML
-documentation.
-
-= Running the unit tests =
-
-Beautiful Soup supports unit test discovery from the project root directory:
-
- $ nosetests
-
- $ python -m unittest discover -s bs4 # Python 2.7 and up
-
-If you checked out the source tree, you should see a script in the
-home directory called test-all-versions. This script will run the unit
-tests under Python 2.7, then create a temporary Python 3 conversion of
-the source and run the unit tests again under Python 3.
-
-= Links =
-
-Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
-Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
- http://readthedocs.org/docs/beautiful-soup-4/
-Discussion group: http://groups.google.com/group/beautifulsoup/
-Development: https://code.launchpad.net/beautifulsoup/
-Bug tracker: https://bugs.launchpad.net/beautifulsoup/
diff --git a/libs/bs4/TODO.txt b/libs/bs4/TODO.txt
deleted file mode 100644
index e26d6264d..000000000
--- a/libs/bs4/TODO.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-Additions
----------
-
-More of the jQuery API: nextUntil?
-
-Optimizations
--------------
-
-The html5lib tree builder doesn't use the standard tree-building API,
-which worries me and has resulted in a number of bugs.
-
-markup_attr_map can be optimized since it's always a map now.
-
-Upon encountering UTF-16LE data or some other uncommon serialization
-of Unicode, UnicodeDammit will convert the data to Unicode, then
-encode it as UTF-8. This is wasteful because it will just get decoded
-back to Unicode.
-
-CDATA
------
-
-The elementtree XMLParser has a strip_cdata argument that, when set to
-False, should allow Beautiful Soup to preserve CDATA sections instead
-of treating them as text. Except it doesn't. (This argument is also
-present for HTMLParser, and also does nothing there.)
-
-Currently, html5lib converts CDATA sections into comments. An
-as-yet-unreleased version of html5lib changes the parser's handling of
-CDATA sections to allow CDATA sections in tags like <svg> and
-<math>. The HTML5TreeBuilder will need to be updated to create CData
-objects instead of Comment objects in this situation.
diff --git a/libs/bs4/__init__.py b/libs/bs4/__init__.py
index 7a80452f7..95ca229c1 100644
--- a/libs/bs4/__init__.py
+++ b/libs/bs4/__init__.py
@@ -17,18 +17,17 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
__author__ = "Leonard Richardson ([email protected])"
-__version__ = "4.6.0"
-__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
+__version__ = "4.8.0"
+__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import os
import re
+import sys
import traceback
import warnings
@@ -50,7 +49,7 @@ from .element import (
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
@@ -74,7 +73,7 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
- ROOT_TAG_NAME = u'[document]'
+ ROOT_TAG_NAME = '[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
@@ -82,16 +81,56 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
- NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
+ NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
- """The Soup object is initialized as the 'root tag', and the
- provided markup (which can be a string or a file-like object)
- is fed into the underlying parser."""
+ """Constructor.
+
+ :param markup: A string or a file-like object representing
+ markup to be parsed.
+
+ :param features: Desirable features of the parser to be used. This
+ may be the name of a specific parser ("lxml", "lxml-xml",
+ "html.parser", or "html5lib") or it may be the type of markup
+ to be used ("html", "html5", "xml"). It's recommended that you
+ name a specific parser, so that Beautiful Soup gives you the
+ same results across platforms and virtual environments.
+
+ :param builder: A TreeBuilder subclass to instantiate (or
+ instance to use) instead of looking one up based on
+ `features`. You only need to use this if you've implemented a
+ custom TreeBuilder.
+
+ :param parse_only: A SoupStrainer. Only parts of the document
+ matching the SoupStrainer will be considered. This is useful
+ when parsing part of a document that would otherwise be too
+ large to fit into memory.
+
+ :param from_encoding: A string indicating the encoding of the
+ document to be parsed. Pass this in if Beautiful Soup is
+ guessing wrongly about the document's encoding.
+
+ :param exclude_encodings: A list of strings indicating
+ encodings known to be wrong. Pass this in if you don't know
+ the document's encoding but you know Beautiful Soup's guess is
+ wrong.
+
+ :param kwargs: For backwards compatibility purposes, the
+ constructor accepts certain keyword arguments used in
+ Beautiful Soup 3. None of these arguments do anything in
+ Beautiful Soup 4; they will result in a warning and then be ignored.
+
+ Apart from this, any keyword arguments passed into the BeautifulSoup
+ constructor are propagated to the TreeBuilder constructor. This
+ makes it possible to configure a TreeBuilder beyond saying
+ which one to use.
+
+ """
if 'convertEntities' in kwargs:
+ del kwargs['convertEntities']
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
@@ -142,18 +181,22 @@ class BeautifulSoup(Tag):
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
- if from_encoding and isinstance(markup, unicode):
+ if from_encoding and isinstance(markup, str):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
- if len(kwargs) > 0:
- arg = kwargs.keys().pop()
- raise TypeError(
- "__init__() got an unexpected keyword argument '%s'" % arg)
-
- if builder is None:
- original_features = features
- if isinstance(features, basestring):
+ # We need this information to track whether or not the builder
+ # was specified well enough that we can omit the 'you need to
+ # specify a parser' warning.
+ original_builder = builder
+ original_features = features
+
+ if isinstance(builder, type):
+ # A builder class was passed in; it needs to be instantiated.
+ builder_class = builder
+ builder = None
+ elif builder is None:
+ if isinstance(features, str):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
@@ -163,41 +206,73 @@ class BeautifulSoup(Tag):
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
- builder = builder_class()
- if not (original_features == builder.NAME or
- original_features in builder.ALTERNATE_NAMES):
+
+ # At this point either we have a TreeBuilder instance in
+ # builder, or we have a builder_class that we can instantiate
+ # with the remaining **kwargs.
+ if builder is None:
+ builder = builder_class(**kwargs)
+ if not original_builder and not (
+ original_features == builder.NAME or
+ original_features in builder.ALTERNATE_NAMES
+ ):
if builder.is_xml:
markup_type = "XML"
else:
markup_type = "HTML"
- caller = traceback.extract_stack()[0]
- filename = caller[0]
- line_number = caller[1]
- warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
- filename=filename,
- line_number=line_number,
- parser=builder.NAME,
- markup_type=markup_type))
-
+ # This code adapted from warnings.py so that we get the same line
+ # of code as our warnings.warn() call gets, even if the answer is wrong
+ # (as it may be in a multithreading situation).
+ caller = None
+ try:
+ caller = sys._getframe(1)
+ except ValueError:
+ pass
+ if caller:
+ globals = caller.f_globals
+ line_number = caller.f_lineno
+ else:
+ globals = sys.__dict__
+ line_number = 1
+ filename = globals.get('__file__')
+ if filename:
+ fnl = filename.lower()
+ if fnl.endswith((".pyc", ".pyo")):
+ filename = filename[:-1]
+ if filename:
+ # If there is no filename at all, the user is most likely in a REPL,
+ # and the warning is not necessary.
+ values = dict(
+ filename=filename,
+ line_number=line_number,
+ parser=builder.NAME,
+ markup_type=markup_type
+ )
+ warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+ else:
+ if kwargs:
+ warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
+
self.builder = builder
self.is_xml = builder.is_xml
self.known_xml = self.is_xml
- self.builder.soup = self
-
+ self._namespaces = dict()
self.parse_only = parse_only
+ self.builder.initialize_soup(self)
+
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
elif len(markup) <= 256 and (
(isinstance(markup, bytes) and not b'<' in markup)
- or (isinstance(markup, unicode) and not u'<' in markup)
+ or (isinstance(markup, str) and not '<' in markup)
):
# Print out warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
- if (isinstance(markup, unicode)
+ if (isinstance(markup, str)
and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
@@ -205,13 +280,13 @@ class BeautifulSoup(Tag):
is_file = False
try:
is_file = os.path.exists(possible_filename)
- except Exception, e:
+ except Exception as e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
pass
if is_file:
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
markup = markup.encode("utf8")
warnings.warn(
'"%s" looks like a filename, not markup. You should'
@@ -263,9 +338,9 @@ class BeautifulSoup(Tag):
if isinstance(markup, bytes):
space = b' '
cant_start_with = (b"http:", b"https:")
- elif isinstance(markup, unicode):
- space = u' '
- cant_start_with = (u"http:", u"https:")
+ elif isinstance(markup, str):
+ space = ' '
+ cant_start_with = ("http:", "https:")
else:
return
@@ -302,9 +377,10 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack = []
self.pushTag(self)
- def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+ def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
"""Create a new tag associated with this soup."""
- return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+ kwattrs.update(attrs)
+ return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
@@ -327,7 +403,7 @@ class BeautifulSoup(Tag):
def pushTag(self, tag):
#print "Push", tag.name
- if self.currentTag:
+ if self.currentTag is not None:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
@@ -336,7 +412,7 @@ class BeautifulSoup(Tag):
def endData(self, containerClass=NavigableString):
if self.current_data:
- current_data = u''.join(self.current_data)
+ current_data = ''.join(self.current_data)
# If whitespace is not preserved, and this string contains
# nothing but ASCII spaces, replace it with a single space
# or newline.
@@ -366,60 +442,71 @@ class BeautifulSoup(Tag):
def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree."""
- parent = parent or self.currentTag
- previous_element = most_recent_element or self._most_recent_element
+ if parent is None:
+ parent = self.currentTag
+ if most_recent_element is not None:
+ previous_element = most_recent_element
+ else:
+ previous_element = self._most_recent_element
next_element = previous_sibling = next_sibling = None
if isinstance(o, Tag):
next_element = o.next_element
next_sibling = o.next_sibling
previous_sibling = o.previous_sibling
- if not previous_element:
+ if previous_element is None:
previous_element = o.previous_element
+ fix = parent.next_element is not None
+
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
self._most_recent_element = o
parent.contents.append(o)
- if parent.next_sibling:
- # This node is being inserted into an element that has
- # already been parsed. Deal with any dangling references.
- index = len(parent.contents)-1
- while index >= 0:
- if parent.contents[index] is o:
- break
- index -= 1
- else:
- raise ValueError(
- "Error building tree: supposedly %r was inserted "
- "into %r after the fact, but I don't see it!" % (
- o, parent
- )
- )
- if index == 0:
- previous_element = parent
- previous_sibling = None
- else:
- previous_element = previous_sibling = parent.contents[index-1]
- if index == len(parent.contents)-1:
- next_element = parent.next_sibling
- next_sibling = None
- else:
- next_element = next_sibling = parent.contents[index+1]
-
- o.previous_element = previous_element
- if previous_element:
- previous_element.next_element = o
- o.next_element = next_element
- if next_element:
- next_element.previous_element = o
- o.next_sibling = next_sibling
- if next_sibling:
- next_sibling.previous_sibling = o
- o.previous_sibling = previous_sibling
- if previous_sibling:
- previous_sibling.next_sibling = o
+ # Check if we are inserting into an already parsed node.
+ if fix:
+ self._linkage_fixer(parent)
+
+ def _linkage_fixer(self, el):
+ """Make sure linkage of this fragment is sound."""
+
+ first = el.contents[0]
+ child = el.contents[-1]
+ descendant = child
+
+ if child is first and el.parent is not None:
+ # Parent should be linked to first child
+ el.next_element = child
+ # We are no longer linked to whatever this element is
+ prev_el = child.previous_element
+ if prev_el is not None and prev_el is not el:
+ prev_el.next_element = None
+ # First child should be linked to the parent, and no previous siblings.
+ child.previous_element = el
+ child.previous_sibling = None
+
+ # We have no sibling as we've been appended as the last.
+ child.next_sibling = None
+
+ # This index is a tag, dig deeper for a "last descendant"
+ if isinstance(child, Tag) and child.contents:
+ descendant = child._last_descendant(False)
+
+ # As the final step, link last descendant. It should be linked
+ # to the parent's next sibling (if found), else walk up the chain
+ # and find a parent with a sibling. It should have no next sibling.
+ descendant.next_element = None
+ descendant.next_sibling = None
+ target = el
+ while True:
+ if target is None:
+ break
+ elif target.next_sibling is not None:
+ descendant.next_element = target.next_sibling
+ target.next_sibling.previous_element = child
+ break
+ target = target.parent
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
@@ -465,7 +552,7 @@ class BeautifulSoup(Tag):
self.currentTag, self._most_recent_element)
if tag is None:
return tag
- if self._most_recent_element:
+ if self._most_recent_element is not None:
self._most_recent_element.next_element = tag
self._most_recent_element = tag
self.pushTag(tag)
@@ -490,9 +577,9 @@ class BeautifulSoup(Tag):
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
- prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+ prefix = '<?xml version="1.0"%s?>\n' % encoding_part
else:
- prefix = u''
+ prefix = ''
if not pretty_print:
indent_level = None
else:
@@ -526,4 +613,4 @@ class FeatureNotFound(ValueError):
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
- print soup.prettify()
+ print(soup.prettify())
diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py
index fdb3362fc..cc497cf0b 100644
--- a/libs/bs4/builder/__init__.py
+++ b/libs/bs4/builder/__init__.py
@@ -1,5 +1,5 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
from collections import defaultdict
import itertools
@@ -7,8 +7,7 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
- HTMLAwareEntitySubstitution,
- whitespace_re
+ nonwhitespace_re
)
__all__ = [
@@ -90,18 +89,46 @@ class TreeBuilder(object):
is_xml = False
picklable = False
- preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
-
+
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
- cdata_list_attributes = {}
-
-
- def __init__(self):
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+ USE_DEFAULT = object()
+
+ def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
+ """Constructor.
+
+ :param multi_valued_attributes: If this is set to None, the
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this to a dictionary will
+ customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+ for an example.
+
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is `multi_valued_attributes`.
+
+ :param preserve_whitespace_tags: A set of tags to treat the way
+ <pre> is treated in HTML: whitespace inside these tags is
+ preserved rather than collapsed.
+ """
self.soup = None
-
+ if multi_valued_attributes is self.USE_DEFAULT:
+ multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+ self.cdata_list_attributes = multi_valued_attributes
+ if preserve_whitespace_tags is self.USE_DEFAULT:
+ preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+ self.preserve_whitespace_tags = preserve_whitespace_tags
+
+ def initialize_soup(self, soup):
+ """The BeautifulSoup object has been initialized and is now
+ being associated with the TreeBuilder.
+ """
+ self.soup = soup
+
def reset(self):
pass
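Because the BeautifulSoup constructor now forwards extra keyword arguments to the TreeBuilder (see the constructor changes earlier in this diff), these options can be set without instantiating a builder by hand; a sketch:

    from bs4 import BeautifulSoup

    # multi_valued_attributes=None turns off the list treatment of 'class'.
    soup = BeautifulSoup('<a class="foo bar"></a>', "html.parser",
                         multi_valued_attributes=None)
    print(soup.a["class"])  # foo bar  -- a plain string, not a list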
@@ -125,7 +152,7 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
-
+
def feed(self, markup):
raise NotImplementedError()
@@ -160,14 +187,14 @@ class TreeBuilder(object):
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None)
- for attr in attrs.keys():
+ for attr in list(attrs.keys()):
if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string
# value is a whitespace-separated list of
# values. Split it into a list.
value = attrs[attr]
- if isinstance(value, basestring):
- values = whitespace_re.split(value)
+ if isinstance(value, str):
+ values = nonwhitespace_re.findall(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
@@ -231,15 +258,20 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
- preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
- # These are from HTML4, removed in HTML5.
- 'spacer', 'frame'
+
+ # These are from earlier versions of HTML and are removed in HTML5.
+ 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
])
+ # The HTML standard defines these as block-level elements. Beautiful
+ # Soup does not treat these elements differently from other elements,
+ # but it may do so eventually, and this information is available if
+ # you need to use it.
+ block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
@@ -247,7 +279,7 @@ class HTMLTreeBuilder(TreeBuilder):
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
- cdata_list_attributes = {
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
@@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"],
}
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
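With the default table above, 'class' is multi-valued for every tag while 'id' is not, so the two are treated differently; a sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p id="p1" class="body strikeout"></p>', "html.parser")
    print(soup.p["class"])  # ['body', 'strikeout'] -- split into a list
    print(soup.p["id"])     # p1 -- left as a single string
    print(soup.p)           # the list is joined back into a string on output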
diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py
index 5f5489358..090bb61a8 100644
--- a/libs/bs4/builder/_html5lib.py
+++ b/libs/bs4/builder/_html5lib.py
@@ -1,5 +1,5 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
__all__ = [
'HTML5TreeBuilder',
@@ -15,7 +15,7 @@ from bs4.builder import (
)
from bs4.element import (
NamespacedAttribute,
- whitespace_re,
+ nonwhitespace_re,
)
import html5lib
from html5lib.constants import (
@@ -33,7 +33,7 @@ try:
# Pre-0.99999999
from html5lib.treebuilders import _base as treebuilder_base
new_html5lib = False
-except ImportError, e:
+except ImportError as e:
# 0.99999999 and up
from html5lib.treebuilders import base as treebuilder_base
new_html5lib = True
@@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
extra_kwargs = dict()
- if not isinstance(markup, unicode):
+ if not isinstance(markup, str):
if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding
else:
@@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
original_encoding = parser.tokenizer.stream.charEncoding[0]
- if not isinstance(original_encoding, basestring):
+ if not isinstance(original_encoding, str):
# In 0.99999999 and up, the encoding is an html5lib
# Encoding object. We want to use a string for compatibility
# with other tree builders.
@@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<html><head></head><body>%s</body></html>' % fragment
+ return '<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
@@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
rv.append("|%s<%s>" % (' ' * indent, name))
if element.attrs:
attributes = []
- for name, value in element.attrs.items():
+ for name, value in list(element.attrs.items()):
if isinstance(name, NamespacedAttribute):
name = "%s %s" % (prefixes[name.namespace], name.name)
if isinstance(value, list):
@@ -199,14 +199,14 @@ class AttrList(object):
def __setitem__(self, name, value):
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
- list_attr = HTML5TreeBuilder.cdata_list_attributes
+ list_attr = self.element.cdata_list_attributes
if (name in list_attr['*']
or (self.element.name in list_attr
and name in list_attr[self.element.name])):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):
- value = whitespace_re.split(value)
+ value = nonwhitespace_re.findall(value)
self.element[name] = value
def items(self):
return list(self.attrs.items())
@@ -229,7 +229,7 @@ class Element(treebuilder_base.Node):
def appendChild(self, node):
string_child = child = None
- if isinstance(node, basestring):
+ if isinstance(node, str):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
@@ -246,10 +246,10 @@ class Element(treebuilder_base.Node):
child = node.element
node.parent = self
- if not isinstance(child, basestring) and child.parent is not None:
+ if not isinstance(child, str) and child.parent is not None:
node.element.extract()
- if (string_child and self.element.contents
+ if (string_child is not None and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like
@@ -259,7 +259,7 @@ class Element(treebuilder_base.Node):
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
- if isinstance(node, basestring):
+ if isinstance(node, str):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
@@ -299,7 +299,7 @@ class Element(treebuilder_base.Node):
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
- for name, value in attributes.items():
+ for name, value in list(attributes.items()):
self.element[name] = value
# The attributes may contain variables that need substitution.
@@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
first_child = to_append[0]
- if new_parents_last_descendant:
+ if new_parents_last_descendant is not None:
first_child.previous_element = new_parents_last_descendant
else:
first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child
- if new_parents_last_descendant:
+ if new_parents_last_descendant is not None:
new_parents_last_descendant.next_element = first_child
else:
new_parent_element.next_element = first_child
- if new_parents_last_child:
+ if new_parents_last_child is not None:
new_parents_last_child.next_sibling = first_child
# Find the very last element being moved. It is now the
@@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
- if new_parents_last_descendant_next_element:
+ if new_parents_last_descendant_next_element is not None:
# TODO: This code has no test coverage and I'm not sure
# how to get html5lib to go through this path, but it's
# just the other side of the previous line.
diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py
index 67890b3a3..ea549c356 100644
--- a/libs/bs4/builder/_htmlparser.py
+++ b/libs/bs4/builder/_htmlparser.py
@@ -1,17 +1,18 @@
+# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
__all__ = [
'HTMLParserTreeBuilder',
]
-from HTMLParser import HTMLParser
+from html.parser import HTMLParser
try:
- from HTMLParser import HTMLParseError
-except ImportError, e:
+ from html.parser import HTMLParseError
+except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
@@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []
-
+
+ def error(self, msg):
+ """In Python 3, HTMLParser subclasses must implement error(), although this
+ requirement doesn't appear to be documented.
+
+ In Python 2, HTMLParser implements error() as raising an exception.
+
+ In any event, this method is called only on very strange markup and our best strategy
+ is to pretend it didn't happen and keep going.
+ """
+ warnings.warn(msg)
+
def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
@@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
else:
real_name = int(name)
- try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
- data = u"\N{REPLACEMENT CHARACTER}"
-
+ data = None
+ if real_name < 256:
+ # HTML numeric entities are supposed to reference Unicode
+ # code points, but sometimes they reference code points in
+ # some other encoding (ahem, Windows-1252). E.g. &#147;
+ # instead of &#x201C; for LEFT DOUBLE QUOTATION MARK. This
+ # code tries to detect this situation and compensate.
+ for encoding in (self.soup.original_encoding, 'windows-1252'):
+ if not encoding:
+ continue
+ try:
+ data = bytearray([real_name]).decode(encoding)
+ except UnicodeDecodeError as e:
+ pass
+ if not data:
+ try:
+ data = chr(real_name)
+ except (ValueError, OverflowError) as e:
+ pass
+ data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
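A sketch of the compensation above: &#147; is a C1 control code in Unicode, but in Windows-1252 it is LEFT DOUBLE QUOTATION MARK, and the fallback recovers it.

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>&#147;quoted&#148;</p>", "html.parser")
    print(soup.p.string)  # “quoted” -- curly quotes, not control characters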
@@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None:
data = character
else:
- data = "&%s;" % name
+ # If this were XML, it would be ambiguous whether "&foo"
+ # was a character entity reference with a missing
+ # semicolon or the literal string "&foo". Since this is
+ # HTML, we have a complete list of all character entity references,
+ # and this one wasn't found, so assume it's the literal string "&foo".
+ data = "&%s" % name
self.handle_data(data)
def handle_comment(self, data):
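The effect of that branch, sketched: an unrecognized entity reference now passes through as literal text, without a semicolon being invented for it.

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>AT&T</p>", "html.parser")
    print(soup.p.string)  # AT&T -- the old code would have produced AT&T;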
@@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
- def __init__(self, *args, **kwargs):
+ def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+ super(HTMLParserTreeBuilder, self).__init__(**kwargs)
+ parser_args = parser_args or []
+ parser_kwargs = parser_kwargs or {}
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
- kwargs['strict'] = False
+ parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
- kwargs['convert_charrefs'] = False
- self.parser_args = (args, kwargs)
+ parser_kwargs['convert_charrefs'] = False
+ self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
@@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
yield (markup, None, None, False)
return
@@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
- except HTMLParseError, e:
+ parser.close()
+ except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py
index d2ca2872d..a490e2301 100644
--- a/libs/bs4/builder/_lxml.py
+++ b/libs/bs4/builder/_lxml.py
@@ -1,13 +1,18 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError as e:
+ from collections import Callable
+
from io import BytesIO
-from StringIO import StringIO
-import collections
+from io import StringIO
from lxml import etree
from bs4.element import (
Comment,
@@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector
LXML = 'lxml'
+def _invert(d):
+ "Invert a dictionary."
+ return dict((v,k) for k, v in list(d.items()))
+
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
@@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# This namespace mapping is specified in the XML Namespace
# standard.
- DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+ DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+
+ DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+
+ def initialize_soup(self, soup):
+ """Let the BeautifulSoup object know about the standard namespace
+ mapping.
+ """
+ super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+ self._register_namespaces(self.DEFAULT_NSMAPS)
+
+ def _register_namespaces(self, mapping):
+ """Let the BeautifulSoup object know about namespaces encountered
+ while parsing the document.
+
+ This might be useful later on when creating CSS selectors.
+ """
+ for key, value in list(mapping.items()):
+ if key and key not in self.soup._namespaces:
+ # Let the BeautifulSoup object know about a new namespace.
+ # If there are multiple namespaces defined with the same
+ # prefix, the first one in the document takes precedence.
+ self.soup._namespaces[key] = value
def default_parser(self, encoding):
# This can either return a parser object or a class, which
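The registered prefixes become the default namespace mapping for CSS selection; a sketch, assuming the soupsieve package is installed:

    from bs4 import BeautifulSoup

    xml = '<root xmlns:ns="http://example.com/ns"><ns:item/></root>'
    soup = BeautifulSoup(xml, "lxml-xml")
    # The 'ns' prefix seen during parsing can be used directly in a selector.
    print(soup.select("ns|item"))  # [<ns:item/>]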
@@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser.
parser = self.default_parser(encoding)
- if isinstance(parser, collections.Callable):
+ if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
- def __init__(self, parser=None, empty_element_tags=None):
+ def __init__(self, parser=None, empty_element_tags=None, **kwargs):
# TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new
# parsers for different encodings.
@@ -71,8 +102,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
self.soup = None
- self.nsmaps = [self.DEFAULT_NSMAPS]
-
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+ super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
+
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
@@ -101,12 +133,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
else:
self.processing_instruction_class = XMLProcessingInstruction
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
@@ -121,7 +153,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
- elif isinstance(markup, unicode):
+ elif isinstance(markup, str):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
@@ -136,30 +168,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def close(self):
- self.nsmaps = [self.DEFAULT_NSMAPS]
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
- if len(self.nsmaps) > 1:
- # There are no new namespaces for this tag, but
- # non-default namespaces are in play, so we need a
- # separate tag stack to know when they end.
- self.nsmaps.append(None)
+ if len(nsmap) == 0 and len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
+ self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
- inverted_nsmap = dict((value, key) for key, value in nsmap.items())
- self.nsmaps.append(inverted_nsmap)
+
+ # First, let the BeautifulSoup object know about it.
+ self._register_namespaces(nsmap)
+
+ # Then, add it to our running list of inverted namespace
+ # mappings.
+ self.nsmaps.append(_invert(nsmap))
+
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
- for prefix, namespace in nsmap.items():
+ for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
@@ -168,7 +206,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
- for attr, value in attrs.items():
+ for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
@@ -228,7 +266,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+ return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@@ -249,10 +287,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<html><body>%s</body></html>' % fragment
+ return '<html><body>%s</body></html>' % fragment
diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py
index 7965565f5..c7ac4d431 100644
--- a/libs/bs4/dammit.py
+++ b/libs/bs4/dammit.py
@@ -6,12 +6,11 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import codecs
-from htmlentitydefs import codepoint2name
+from html.entities import codepoint2name
import re
import logging
import string
@@ -46,9 +45,9 @@ except ImportError:
pass
xml_encoding_re = re.compile(
- '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+ '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
html_meta_re = re.compile(
- '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+ '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
@@ -58,15 +57,24 @@ class EntitySubstitution(object):
lookup = {}
reverse_lookup = {}
characters_for_re = []
- for codepoint, name in list(codepoint2name.items()):
- character = unichr(codepoint)
- if codepoint != 34:
+
+ # &apos; is an XHTML and HTML 5 entity, but not an HTML 4
+ # entity. We don't want to use it, but we want to recognize it on the way in.
+ #
+ # TODO: Ideally we would be able to recognize all HTML 5 named
+ # entities, but that's a little tricky.
+ extra = [(39, 'apos')]
+ for codepoint, name in list(codepoint2name.items()) + extra:
+ character = chr(codepoint)
+ if codepoint not in (34, 39):
# There's no point in turning the quotation mark into
- # &quot;, unless it happens within an attribute value, which
- # is handled elsewhere.
+ # &quot; or the single quote into &apos;, unless it
+ # happens within an attribute value, which is handled
+ # elsewhere.
characters_for_re.append(character)
lookup[character] = name
- # But we do want to turn &quot; into the quotation mark.
+ # But we do want to recognize those entities on the way in and
+ # convert them to Unicode characters.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
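The asymmetry described above, sketched: &apos; is decoded on the way in but never generated on the way out.

    from bs4 import BeautifulSoup
    from bs4.dammit import EntitySubstitution

    soup = BeautifulSoup("<p>it&apos;s</p>", "html.parser")
    print(soup.p.string)                               # it's
    print(EntitySubstitution.substitute_html("it's"))  # it's -- no &apos;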
@@ -82,7 +90,7 @@ class EntitySubstitution(object):
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
- "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@@ -274,7 +282,7 @@ class EncodingDetector:
def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None
- if isinstance(data, unicode):
+ if isinstance(data, str):
# Unicode data cannot have a byte-order mark.
return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
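A quick sketch of what this method returns for a UTF-8 byte-order mark:

    from bs4.dammit import EncodingDetector

    data, encoding = EncodingDetector.strip_byte_order_mark(b"\xef\xbb\xbfhi")
    print(data, encoding)  # b'hi' utf-8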
@@ -352,9 +360,9 @@ class UnicodeDammit:
markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with.
- if isinstance(markup, unicode) or markup == '':
+ if isinstance(markup, str) or markup == '':
self.markup = markup
- self.unicode_markup = unicode(markup)
+ self.unicode_markup = str(markup)
self.original_encoding = None
return
@@ -438,7 +446,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
- return unicode(data, encoding, errors)
+ return str(data, encoding, errors)
@property
def declared_html_encoding(self):
diff --git a/libs/bs4/diagnose.py b/libs/bs4/diagnose.py
index 8768332f5..b5f6e6c8b 100644
--- a/libs/bs4/diagnose.py
+++ b/libs/bs4/diagnose.py
@@ -1,12 +1,11 @@
"""Diagnostic functions, mainly for use when doing tech support."""
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import cProfile
-from StringIO import StringIO
-from HTMLParser import HTMLParser
+from io import StringIO
+from html.parser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
@@ -22,8 +21,8 @@ import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
- print "Diagnostic running on Beautiful Soup %s" % __version__
- print "Python version %s" % sys.version
+ print("Diagnostic running on Beautiful Soup %s" % __version__)
+ print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
@@ -32,16 +31,16 @@ def diagnose(data):
break
else:
basic_parsers.remove(name)
- print (
+ print((
"I noticed that %s is not installed. Installing it may help." %
- name)
+ name))
if 'lxml' in basic_parsers:
- basic_parsers.append(["lxml", "xml"])
+ basic_parsers.append("lxml-xml")
try:
from lxml import etree
- print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
- except ImportError, e:
+ print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
+ except ImportError as e:
print (
"lxml is not installed or couldn't be imported.")
@@ -49,37 +48,43 @@ def diagnose(data):
if 'html5lib' in basic_parsers:
try:
import html5lib
- print "Found html5lib version %s" % html5lib.__version__
- except ImportError, e:
+ print("Found html5lib version %s" % html5lib.__version__)
+ except ImportError as e:
print (
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
- elif os.path.exists(data):
- print '"%s" looks like a filename. Reading data from the file.' % data
- with open(data) as fp:
- data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
- print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
- print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+ print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+ print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
- print
+ else:
+ try:
+ if os.path.exists(data):
+ print('"%s" looks like a filename. Reading data from the file.' % data)
+ with open(data) as fp:
+ data = fp.read()
+ except ValueError:
+ # This can happen on some platforms when the 'filename' is
+ # too long. Assume it's data and not a filename.
+ pass
+ print()
for parser in basic_parsers:
- print "Trying to parse your markup with %s" % parser
+ print("Trying to parse your markup with %s" % parser)
success = False
try:
- soup = BeautifulSoup(data, parser)
+ soup = BeautifulSoup(data, features=parser)
success = True
- except Exception, e:
- print "%s could not parse the markup." % parser
+ except Exception as e:
+ print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
- print "Here's what %s did with the markup:" % parser
- print soup.prettify()
+ print("Here's what %s did with the markup:" % parser)
+ print(soup.prettify())
- print "-" * 80
+ print("-" * 80)
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
@@ -89,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
- print("%s, %4s, %s" % (event, element.tag, element.text))
+ print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else."""
@@ -149,7 +154,7 @@ def rword(length=5):
def rsentence(length=4):
"Generate a random sentence-like string."
- return " ".join(rword(random.randint(4,9)) for i in range(length))
+ return " ".join(rword(random.randint(4,9)) for i in list(range(length)))
def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document."""
@@ -171,9 +176,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
- print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+ print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements)
- print "Generated a large invalid HTML document (%d bytes)." % len(data)
+ print("Generated a large invalid HTML document (%d bytes)." % len(data))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
@@ -182,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
- except Exception, e:
- print "%s could not parse the markup." % parser
+ except Exception as e:
+ print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
- print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+ print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
- print "Raw lxml parsed the markup in %.2fs." % (b-a)
+ print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
- print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+ print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"):
diff --git a/libs/bs4/element.py b/libs/bs4/element.py
index 9ef75f814..f16b1663e 100644
--- a/libs/bs4/element.py
+++ b/libs/bs4/element.py
@@ -1,18 +1,35 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
__license__ = "MIT"
-import collections
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError as e:
+ from collections import Callable
import re
-import shlex
import sys
import warnings
-from bs4.dammit import EntitySubstitution
+try:
+ import soupsieve
+except ImportError as e:
+ soupsieve = None
+ warnings.warn(
+ 'The soupsieve package is not installed. CSS selectors cannot be used.'
+ )
+
+from bs4.formatter import (
+ Formatter,
+ HTMLFormatter,
+ XMLFormatter,
+)
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
-whitespace_re = re.compile("\s+")
+nonwhitespace_re = re.compile(r"\S+")
+
+# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
+# the off chance someone imported it for their own use.
+whitespace_re = re.compile(r"\s+")
def _alias(attr):
"""Alias one attribute name to another for backward compatibility"""
@@ -26,22 +43,22 @@ def _alias(attr):
return alias
-class NamespacedAttribute(unicode):
+class NamespacedAttribute(str):
def __new__(cls, prefix, name, namespace=None):
if name is None:
- obj = unicode.__new__(cls, prefix)
+ obj = str.__new__(cls, prefix)
elif prefix is None:
# Not really namespaced.
- obj = unicode.__new__(cls, name)
+ obj = str.__new__(cls, name)
else:
- obj = unicode.__new__(cls, prefix + ":" + name)
+ obj = str.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
obj.name = name
obj.namespace = namespace
return obj
-class AttributeValueWithCharsetSubstitution(unicode):
+class AttributeValueWithCharsetSubstitution(str):
"""A stand-in object for a character encoding specified in HTML."""
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@@ -52,7 +69,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
"""
def __new__(cls, original_value):
- obj = unicode.__new__(cls, original_value)
+ obj = str.__new__(cls, original_value)
obj.original_value = original_value
return obj
@@ -69,15 +86,15 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
The value of the 'content' attribute will be one of these objects.
"""
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+ CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
def __new__(cls, original_value):
match = cls.CHARSET_RE.search(original_value)
if match is None:
# No substitution necessary.
- return unicode.__new__(unicode, original_value)
+ return str.__new__(str, original_value)
- obj = unicode.__new__(cls, original_value)
+ obj = str.__new__(cls, original_value)
obj.original_value = original_value
return obj
@@ -86,94 +103,71 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
-class HTMLAwareEntitySubstitution(EntitySubstitution):
-
- """Entity substitution rules that are aware of some HTML quirks.
-
- Specifically, the contents of <script> and <style> tags should not
- undergo entity substitution.
-
- Incoming NavigableString objects are checked to see if they're the
- direct children of a <script> or <style> tag.
- """
-
- cdata_containing_tags = set(["script", "style"])
-
- preformatted_tags = set(["pre"])
+
+class PageElement(object):
+ """Contains the navigational information for some part of the page
+ (either a tag or a piece of text)"""
+
+ def setup(self, parent=None, previous_element=None, next_element=None,
+ previous_sibling=None, next_sibling=None):
+ """Sets up the initial relations between this element and
+ other elements."""
+ self.parent = parent
- preserve_whitespace_tags = set(['pre', 'textarea'])
+ self.previous_element = previous_element
+ if previous_element is not None:
+ self.previous_element.next_element = self
- @classmethod
- def _substitute_if_appropriate(cls, ns, f):
- if (isinstance(ns, NavigableString)
- and ns.parent is not None
- and ns.parent.name in cls.cdata_containing_tags):
- # Do nothing.
- return ns
- # Substitute.
- return f(ns)
+ self.next_element = next_element
+ if self.next_element is not None:
+ self.next_element.previous_element = self
- @classmethod
- def substitute_html(cls, ns):
- return cls._substitute_if_appropriate(
- ns, EntitySubstitution.substitute_html)
+ self.next_sibling = next_sibling
+ if self.next_sibling is not None:
+ self.next_sibling.previous_sibling = self
- @classmethod
- def substitute_xml(cls, ns):
- return cls._substitute_if_appropriate(
- ns, EntitySubstitution.substitute_xml)
+ if (previous_sibling is None
+ and self.parent is not None and self.parent.contents):
+ previous_sibling = self.parent.contents[-1]
-class PageElement(object):
- """Contains the navigational information for some part of the page
- (either a tag or a piece of text)"""
+ self.previous_sibling = previous_sibling
+ if previous_sibling is not None:
+ self.previous_sibling.next_sibling = self
- # There are five possible values for the "formatter" argument passed in
- # to methods like encode() and prettify():
- #
- # "html" - All Unicode characters with corresponding HTML entities
- # are converted to those entities on output.
- # "minimal" - Bare ampersands and angle brackets are converted to
- # XML entities: &amp; &lt; &gt;
- # None - The null formatter. Unicode characters are never
- # converted to entities. This is not recommended, but it's
- # faster than "minimal".
- # A function - This function will be called on every string that
- # needs to undergo entity substitution.
- #
-
- # In an HTML document, the default "html" and "minimal" functions
- # will leave the contents of <script> and <style> tags alone. For
- # an XML document, all tags will be given the same treatment.
-
- HTML_FORMATTERS = {
- "html" : HTMLAwareEntitySubstitution.substitute_html,
- "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
- None : None
- }
-
- XML_FORMATTERS = {
- "html" : EntitySubstitution.substitute_html,
- "minimal" : EntitySubstitution.substitute_xml,
- None : None
- }
-
- def format_string(self, s, formatter='minimal'):
+ def format_string(self, s, formatter):
"""Format the given string using the given formatter."""
- if not callable(formatter):
- formatter = self._formatter_for_name(formatter)
if formatter is None:
- output = s
- else:
- output = formatter(s)
+ return s
+ if not isinstance(formatter, Formatter):
+ formatter = self.formatter_for_name(formatter)
+ output = formatter.substitute(s)
return output
+ def formatter_for_name(self, formatter):
+ """Look up or create a Formatter for the given identifier,
+ if necessary.
+
+ :param formatter: Can be a Formatter object (used as-is), a
+ function (used as the entity substitution hook for an
+ XMLFormatter or HTMLFormatter), or a string (used to look up
+ an XMLFormatter or HTMLFormatter in the appropriate registry).
+ """
+ if isinstance(formatter, Formatter):
+ return formatter
+ if self._is_xml:
+ c = XMLFormatter
+ else:
+ c = HTMLFormatter
+ if callable(formatter):
+ return c(entity_substitution=formatter)
+ return c.REGISTRY[formatter]
+
@property
def _is_xml(self):
"""Is this element part of an XML tree or an HTML tree?
- This is used when mapping a formatter name ("minimal") to an
- appropriate function (one that performs entity-substitution on
- the contents of <script> and <style> tags, or not). It can be
+ This is used in formatter_for_name, when deciding whether an
+ XMLFormatter or HTMLFormatter is more appropriate. It can be
inefficient, but it should be called very rarely.
"""
if self.known_xml is not None:
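How the three accepted forms of `formatter` behave in practice, sketched:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>café & croissants</p>", "html.parser")
    # A string is looked up in the formatter registry...
    print(soup.p.decode(formatter="html"))  # <p>caf&eacute; &amp; croissants</p>
    # ...while a bare function becomes the entity-substitution hook.
    print(soup.p.decode(formatter=lambda s: s.upper()))  # <p>CAFÉ & CROISSANTS</p>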
@@ -191,48 +185,13 @@ class PageElement(object):
return getattr(self, 'is_xml', False)
return self.parent._is_xml
- def _formatter_for_name(self, name):
- "Look up a formatter function based on its name and the tree."
- if self._is_xml:
- return self.XML_FORMATTERS.get(
- name, EntitySubstitution.substitute_xml)
- else:
- return self.HTML_FORMATTERS.get(
- name, HTMLAwareEntitySubstitution.substitute_xml)
-
- def setup(self, parent=None, previous_element=None, next_element=None,
- previous_sibling=None, next_sibling=None):
- """Sets up the initial relations between this element and
- other elements."""
- self.parent = parent
-
- self.previous_element = previous_element
- if previous_element is not None:
- self.previous_element.next_element = self
-
- self.next_element = next_element
- if self.next_element:
- self.next_element.previous_element = self
-
- self.next_sibling = next_sibling
- if self.next_sibling:
- self.next_sibling.previous_sibling = self
-
- if (not previous_sibling
- and self.parent is not None and self.parent.contents):
- previous_sibling = self.parent.contents[-1]
-
- self.previous_sibling = previous_sibling
- if previous_sibling:
- self.previous_sibling.next_sibling = self
-
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with):
- if not self.parent:
+ if self.parent is None:
raise ValueError(
- "Cannot replace one element with another when the"
+ "Cannot replace one element with another when the "
"element to be replaced is not part of a tree.")
if replace_with is self:
return
@@ -247,7 +206,7 @@ class PageElement(object):
def unwrap(self):
my_parent = self.parent
- if not self.parent:
+ if self.parent is None:
raise ValueError(
"Cannot replace an element with its contents when that"
"element is not part of a tree.")
@@ -295,7 +254,7 @@ class PageElement(object):
def _last_descendant(self, is_initialized=True, accept_self=True):
"Finds the last element beneath this object to be parsed."
- if is_initialized and self.next_sibling:
+ if is_initialized and self.next_sibling is not None:
last_child = self.next_sibling.previous_element
else:
last_child = self
@@ -312,10 +271,18 @@ class PageElement(object):
raise ValueError("Cannot insert None into a tag.")
if new_child is self:
raise ValueError("Cannot insert a tag into itself.")
- if (isinstance(new_child, basestring)
+ if (isinstance(new_child, str)
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
+ from bs4 import BeautifulSoup
+ if isinstance(new_child, BeautifulSoup):
+ # We don't want to end up with a situation where one BeautifulSoup
+ # object contains another. Insert the children one at a time.
+ for subchild in list(new_child.contents):
+ self.insert(position, subchild)
+ position += 1
+ return
position = min(position, len(self.contents))
if hasattr(new_child, 'parent') and new_child.parent is not None:
# We're 'inserting' an element that's already one
@@ -377,43 +344,54 @@ class PageElement(object):
"""Appends the given tag to the contents of this tag."""
self.insert(len(self.contents), tag)
- def insert_before(self, predecessor):
- """Makes the given element the immediate predecessor of this one.
+ def extend(self, tags):
+ """Appends the given tags to the contents of this tag."""
+ for tag in tags:
+ self.append(tag)
- The two elements will have the same parent, and the given element
+ def insert_before(self, *args):
+ """Makes the given element(s) the immediate predecessor of this one.
+
+ The elements will have the same parent, and the given elements
will be immediately before this one.
"""
- if self is predecessor:
- raise ValueError("Can't insert an element before itself.")
parent = self.parent
if parent is None:
raise ValueError(
"Element has no parent, so 'before' has no meaning.")
- # Extract first so that the index won't be screwed up if they
- # are siblings.
- if isinstance(predecessor, PageElement):
- predecessor.extract()
- index = parent.index(self)
- parent.insert(index, predecessor)
-
- def insert_after(self, successor):
- """Makes the given element the immediate successor of this one.
-
- The two elements will have the same parent, and the given element
+ if any(x is self for x in args):
+ raise ValueError("Can't insert an element before itself.")
+ for predecessor in args:
+ # Extract first so that the index won't be screwed up if they
+ # are siblings.
+ if isinstance(predecessor, PageElement):
+ predecessor.extract()
+ index = parent.index(self)
+ parent.insert(index, predecessor)
+
+ def insert_after(self, *args):
+ """Makes the given element(s) the immediate successor of this one.
+
+ The elements will have the same parent, and the given elements
will be immediately after this one.
"""
- if self is successor:
- raise ValueError("Can't insert an element after itself.")
+ # Do all error checking before modifying the tree.
parent = self.parent
if parent is None:
raise ValueError(
"Element has no parent, so 'after' has no meaning.")
- # Extract first so that the index won't be screwed up if they
- # are siblings.
- if isinstance(successor, PageElement):
- successor.extract()
- index = parent.index(self)
- parent.insert(index+1, successor)
+ if any(x is self for x in args):
+ raise ValueError("Can't insert an element after itself.")
+
+ offset = 0
+ for successor in args:
+ # Extract first so that the index won't be screwed up if they
+ # are siblings.
+ if isinstance(successor, PageElement):
+ successor.extract()
+ index = parent.index(self)
+ parent.insert(index+1+offset, successor)
+ offset += 1
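
A short usage sketch of the new multi-element form (the extract-then-reindex loop above is what keeps sibling order stable when the arguments are existing siblings):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p><b>middle</b></p>", "html.parser")
    soup.b.insert_before("first ", "second ")
    soup.b.insert_after(" fourth", " fifth")
    print(soup.p.decode())
    # <p>first second <b>middle</b> fourth fifth</p>
    soup.p.extend([" sixth"])   # extend() appends each item in turn
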
def find_next(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
@@ -533,17 +511,24 @@ class PageElement(object):
result = (element for element in generator
if isinstance(element, Tag))
return ResultSet(strainer, result)
- elif isinstance(name, basestring):
+ elif isinstance(name, str):
# Optimization to find all tags with a given name.
if name.count(':') == 1:
- # This is a name with a prefix.
- prefix, name = name.split(':', 1)
+ # This is a name with a prefix. If this is a namespace-aware document,
+ # we need to match the local name against tag.name. If not,
+ # we need to match the fully-qualified name against tag.name.
+ prefix, local_name = name.split(':', 1)
else:
prefix = None
+ local_name = name
result = (element for element in generator
if isinstance(element, Tag)
- and element.name == name
- and (prefix is None or element.prefix == prefix)
+                      and (
+                          element.name == name
+                          or (
+                              element.name == local_name
+                              and (prefix is None or element.prefix == prefix)
+                          )
+                      )
)
return ResultSet(strainer, result)
results = ResultSet(strainer)
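
This mirrors the test_namespaced_html case added to testing.py below; a minimal sketch of the intent:

    from bs4 import BeautifulSoup

    # html.parser is not namespace-aware, so "ns1:foo" is simply the tag's
    # full name and matches element.name directly.
    soup = BeautifulSoup("<ns1:foo>content</ns1:foo><ns2:foo/>", "html.parser")
    print(len(soup.find_all("ns1:foo")))   # 1
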
@@ -597,82 +582,6 @@ class PageElement(object):
yield i
i = i.parent
- # Methods for supporting CSS selectors.
-
- tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
-
- # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
- # \---------------------------/ \---/\-------------/ \-------/
- # | | | |
- # | | | The value
- # | | ~,|,^,$,* or =
- # | Attribute
- # Tag
- attribselect_re = re.compile(
- r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
- r'=?"?(?P<value>[^\]"]*)"?\]$'
- )
-
- def _attr_value_as_string(self, value, default=None):
- """Force an attribute value into a string representation.
-
- A multi-valued attribute will be converted into a
- space-separated stirng.
- """
- value = self.get(value, default)
- if isinstance(value, list) or isinstance(value, tuple):
- value =" ".join(value)
- return value
-
- def _tag_name_matches_and(self, function, tag_name):
- if not tag_name:
- return function
- else:
- def _match(tag):
- return tag.name == tag_name and function(tag)
- return _match
-
- def _attribute_checker(self, operator, attribute, value=''):
- """Create a function that performs a CSS selector operation.
-
- Takes an operator, attribute and optional value. Returns a
- function that will return True for elements that match that
- combination.
- """
- if operator == '=':
- # string representation of `attribute` is equal to `value`
- return lambda el: el._attr_value_as_string(attribute) == value
- elif operator == '~':
- # space-separated list representation of `attribute`
- # contains `value`
- def _includes_value(element):
- attribute_value = element.get(attribute, [])
- if not isinstance(attribute_value, list):
- attribute_value = attribute_value.split()
- return value in attribute_value
- return _includes_value
- elif operator == '^':
- # string representation of `attribute` starts with `value`
- return lambda el: el._attr_value_as_string(
- attribute, '').startswith(value)
- elif operator == '$':
- # string representation of `attribute` ends with `value`
- return lambda el: el._attr_value_as_string(
- attribute, '').endswith(value)
- elif operator == '*':
- # string representation of `attribute` contains `value`
- return lambda el: value in el._attr_value_as_string(attribute, '')
- elif operator == '|':
- # string representation of `attribute` is either exactly
- # `value` or starts with `value` and then a dash.
- def _is_or_starts_with_dash(element):
- attribute_value = element._attr_value_as_string(attribute, '')
- return (attribute_value == value or attribute_value.startswith(
- value + '-'))
- return _is_or_starts_with_dash
- else:
- return lambda el: el.has_attr(attribute)
-
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@@ -691,7 +600,7 @@ class PageElement(object):
return self.parents
-class NavigableString(unicode, PageElement):
+class NavigableString(str, PageElement):
PREFIX = ''
SUFFIX = ''
@@ -709,10 +618,10 @@ class NavigableString(unicode, PageElement):
passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters.
"""
- if isinstance(value, unicode):
- u = unicode.__new__(cls, value)
+ if isinstance(value, str):
+ u = str.__new__(cls, value)
else:
- u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+ u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
u.setup()
return u
@@ -723,7 +632,7 @@ class NavigableString(unicode, PageElement):
return type(self)(self)
def __getnewargs__(self):
- return (unicode(self),)
+ return (str(self),)
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
@@ -737,6 +646,7 @@ class NavigableString(unicode, PageElement):
self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"):
+ """Run the string through the provided formatter."""
output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
@@ -755,37 +665,39 @@ class PreformattedString(NavigableString):
but the return value will be ignored.
"""
- def output_ready(self, formatter="minimal"):
- """CData strings are passed into the formatter.
- But the return value is ignored."""
- self.format_string(self, formatter)
+ def output_ready(self, formatter=None):
+ """CData strings are passed into the formatter, purely
+ for any side effects. The return value is ignored.
+ """
+ if formatter is not None:
+ ignore = self.format_string(self, formatter)
return self.PREFIX + self + self.SUFFIX
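
For example (a minimal sketch; CData is one of the PreformattedString subclasses defined just below):

    from bs4 import BeautifulSoup, CData

    soup = BeautifulSoup("<a></a>", "html.parser")
    soup.a.append(CData("x < y"))
    print(soup.a)   # <a><![CDATA[x < y]]></a> -- contents pass through unescaped
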
class CData(PreformattedString):
- PREFIX = u'<![CDATA['
- SUFFIX = u']]>'
+ PREFIX = '<![CDATA['
+ SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString):
"""A SGML processing instruction."""
- PREFIX = u'<?'
- SUFFIX = u'>'
+ PREFIX = '<?'
+ SUFFIX = '>'
class XMLProcessingInstruction(ProcessingInstruction):
"""An XML processing instruction."""
- PREFIX = u'<?'
- SUFFIX = u'?>'
+ PREFIX = '<?'
+ SUFFIX = '?>'
class Comment(PreformattedString):
- PREFIX = u'<!--'
- SUFFIX = u'-->'
+ PREFIX = '<!--'
+ SUFFIX = '-->'
class Declaration(PreformattedString):
- PREFIX = u'<?'
- SUFFIX = u'?>'
+ PREFIX = '<?'
+ SUFFIX = '?>'
class Doctype(PreformattedString):
@@ -802,8 +714,8 @@ class Doctype(PreformattedString):
return Doctype(value)
- PREFIX = u'<!DOCTYPE '
- SUFFIX = u'>\n'
+ PREFIX = '<!DOCTYPE '
+ SUFFIX = '>\n'
class Tag(PageElement):
@@ -826,14 +738,6 @@ class Tag(PageElement):
self.name = name
self.namespace = namespace
self.prefix = prefix
- if builder is not None:
- preserve_whitespace_tags = builder.preserve_whitespace_tags
- else:
- if is_xml:
- preserve_whitespace_tags = []
- else:
- preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
- self.preserve_whitespace_tags = preserve_whitespace_tags
if attrs is None:
attrs = {}
elif attrs:
@@ -856,13 +760,32 @@ class Tag(PageElement):
self.setup(parent, previous)
self.hidden = False
- # Set up any substitutions, such as the charset in a META tag.
- if builder is not None:
+ if builder is None:
+ # In the absence of a TreeBuilder, assume this tag is nothing
+ # special.
+ self.can_be_empty_element = False
+ self.cdata_list_attributes = None
+ else:
+ # Set up any substitutions for this tag, such as the charset in a META tag.
builder.set_up_substitutions(self)
+
+ # Ask the TreeBuilder whether this tag might be an empty-element tag.
self.can_be_empty_element = builder.can_be_empty_element(name)
- else:
- self.can_be_empty_element = False
+ # Keep track of the list of attributes of this tag that
+ # might need to be treated as a list.
+ #
+ # For performance reasons, we store the whole data structure
+ # rather than asking the question of every tag. Asking would
+ # require building a new data structure every time, and
+ # (unlike can_be_empty_element), we almost never need
+ # to check this.
+ self.cdata_list_attributes = builder.cdata_list_attributes
+
+ # Keep track of the names that might cause this tag to be treated as a
+ # whitespace-preserved tag.
+ self.preserve_whitespace_tags = builder.preserve_whitespace_tags
+
parserClass = _alias("parser_class") # BS3
def __copy__(self):
@@ -942,7 +865,7 @@ class Tag(PageElement):
for string in self._all_strings(True):
yield string
- def get_text(self, separator=u"", strip=False,
+ def get_text(self, separator="", strip=False,
types=(NavigableString, CData)):
"""
Get all child strings, concatenated using the given separator.
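
The behavior is unchanged by the str migration; for instance:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>Hello <b>world</b></p>", "html.parser")
    soup.get_text()                  # 'Hello world'
    soup.get_text("|", strip=True)   # 'Hello|world'
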
@@ -976,6 +899,43 @@ class Tag(PageElement):
for element in self.contents[:]:
element.extract()
+ def smooth(self):
+ """Smooth out this element's children by consolidating consecutive strings.
+
+ This makes pretty-printed output look more natural following a
+ lot of operations that modified the tree.
+ """
+ # Mark the first position of every pair of children that need
+ # to be consolidated. Do this rather than making a copy of
+ # self.contents, since in most cases very few strings will be
+ # affected.
+ marked = []
+ for i, a in enumerate(self.contents):
+ if isinstance(a, Tag):
+ # Recursively smooth children.
+ a.smooth()
+ if i == len(self.contents)-1:
+                # This is the last item in .contents; there's no
+                # following item to consolidate it with.
+ continue
+ b = self.contents[i+1]
+ if (isinstance(a, NavigableString)
+ and isinstance(b, NavigableString)
+ and not isinstance(a, PreformattedString)
+ and not isinstance(b, PreformattedString)
+ ):
+ marked.append(i)
+
+ # Go over the marked positions in reverse order, so that
+ # removing items from .contents won't affect the remaining
+ # positions.
+ for i in reversed(marked):
+ a = self.contents[i]
+ b = self.contents[i+1]
+ b.extract()
+ n = NavigableString(a+b)
+ a.replace_with(n)
+
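
A brief sketch of the intended effect:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>one <b>two</b> three</p>", "html.parser")
    soup.b.extract()              # leaves "one " and " three" side by side
    print(len(soup.p.contents))   # 2 -- two adjacent NavigableStrings
    soup.smooth()
    print(len(soup.p.contents))   # 1 -- consolidated into "one  three"
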
def index(self, element):
"""
Find the index of a child by identity, not value. Avoids issues with
@@ -1021,7 +981,7 @@ class Tag(PageElement):
def __contains__(self, x):
return x in self.contents
- def __nonzero__(self):
+ def __bool__(self):
"A tag is non-None even if it has no contents."
return True
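
In practice this preserves the long-standing truthiness contract under Python 3:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<b></b>", "html.parser")
    bool(soup.b)    # True -- an empty tag is still truthy
    soup.find("i")  # None -- test find() results with `is None`, not truthiness
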
@@ -1046,8 +1006,10 @@ class Tag(PageElement):
# BS3: soup.aTag -> "soup.find("a")
tag_name = tag[:-3]
warnings.warn(
- '.%sTag is deprecated, use .find("%s") instead.' % (
- tag_name, tag_name))
+ '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
+ name=tag_name
+ )
+ )
return self.find(tag_name)
# We special case contents to avoid recursion.
elif not tag.startswith("__") and not tag == "contents":
@@ -1108,14 +1070,6 @@ class Tag(PageElement):
u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors)
- def _should_pretty_print(self, indent_level):
- """Should this tag be pretty-printed?"""
-
- return (
- indent_level is not None
- and self.name not in self.preserve_whitespace_tags
- )
-
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@@ -1129,31 +1083,32 @@ class Tag(PageElement):
encoding.
"""
- # First off, turn a string formatter into a function. This
- # will stop the lookup from happening over and over again.
- if not callable(formatter):
- formatter = self._formatter_for_name(formatter)
-
+ # First off, turn a non-Formatter `formatter` into a Formatter
+ # object. This will stop the lookup from happening over and
+ # over again.
+ if not isinstance(formatter, Formatter):
+ formatter = self.formatter_for_name(formatter)
+ attributes = formatter.attributes(self)
attrs = []
- if self.attrs:
- for key, val in sorted(self.attrs.items()):
- if val is None:
- decoded = key
- else:
- if isinstance(val, list) or isinstance(val, tuple):
- val = ' '.join(val)
- elif not isinstance(val, basestring):
- val = unicode(val)
- elif (
+ for key, val in attributes:
+ if val is None:
+ decoded = key
+ else:
+ if isinstance(val, list) or isinstance(val, tuple):
+ val = ' '.join(val)
+ elif not isinstance(val, str):
+ val = str(val)
+ elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
- and eventual_encoding is not None):
- val = val.encode(eventual_encoding)
-
- text = self.format_string(val, formatter)
- decoded = (
- unicode(key) + '='
- + EntitySubstitution.quoted_attribute_value(text))
- attrs.append(decoded)
+ and eventual_encoding is not None
+ ):
+ val = val.encode(eventual_encoding)
+
+ text = formatter.attribute_value(val)
+ decoded = (
+ str(key) + '='
+ + formatter.quoted_attribute_value(text))
+ attrs.append(decoded)
close = ''
closeTag = ''
@@ -1162,7 +1117,7 @@ class Tag(PageElement):
prefix = self.prefix + ":"
if self.is_empty_element:
- close = '/'
+ close = formatter.void_element_close_prefix or ''
else:
closeTag = '</%s%s>' % (prefix, self.name)
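
Assuming this version ships the new bs4.formatter module (the void_element_close_prefix attribute referenced above comes from it), the effect can be sketched as:

    from bs4 import BeautifulSoup
    from bs4.formatter import HTMLFormatter   # assumed import path

    soup = BeautifulSoup("<br>", "html.parser")
    print(soup.decode())                  # <br/> -- default prefix is '/'
    bare = HTMLFormatter(void_element_close_prefix="")
    print(soup.decode(formatter=bare))    # <br>
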
@@ -1177,7 +1132,8 @@ class Tag(PageElement):
else:
indent_contents = None
contents = self.decode_contents(
- indent_contents, eventual_encoding, formatter)
+ indent_contents, eventual_encoding, formatter
+ )
if self.hidden:
# This is the 'document root' object.
@@ -1209,6 +1165,13 @@ class Tag(PageElement):
s = ''.join(s)
return s
+ def _should_pretty_print(self, indent_level):
+ """Should this tag be pretty-printed?"""
+ return (
+ indent_level is not None
+ and self.name not in self.preserve_whitespace_tags
+ )
+
def prettify(self, encoding=None, formatter="minimal"):
if encoding is None:
return self.decode(True, formatter=formatter)
@@ -1224,19 +1187,19 @@ class Tag(PageElement):
indented this many spaces.
:param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
+ encoded into this encoding. decode_contents() is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's
encoding.
- :param formatter: The output formatter responsible for converting
- entities to Unicode characters.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard Formatters.
"""
- # First off, turn a string formatter into a function. This
+ # First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
- if not callable(formatter):
- formatter = self._formatter_for_name(formatter)
+ if not isinstance(formatter, Formatter):
+ formatter = self.formatter_for_name(formatter)
pretty_print = (indent_level is not None)
s = []
@@ -1247,16 +1210,19 @@ class Tag(PageElement):
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
- if text and indent_level and not self.name == 'pre':
+ preserve_whitespace = (
+ self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
+ )
+ if text and indent_level and not preserve_whitespace:
text = text.strip()
if text:
- if pretty_print and not self.name == 'pre':
+ if pretty_print and not preserve_whitespace:
s.append(" " * (indent_level - 1))
s.append(text)
- if pretty_print and not self.name == 'pre':
+ if pretty_print and not preserve_whitespace:
s.append("\n")
return ''.join(s)
-
+
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@@ -1331,236 +1297,41 @@ class Tag(PageElement):
current = current.next_element
# CSS selector code
-
- _selector_combinators = ['>', '+', '~']
- _select_debug = False
- quoted_colon = re.compile('"[^"]*:[^"]*"')
- def select_one(self, selector):
+ def select_one(self, selector, namespaces=None, **kwargs):
"""Perform a CSS selection operation on the current element."""
- value = self.select(selector, limit=1)
+ value = self.select(selector, namespaces, 1, **kwargs)
if value:
return value[0]
return None
- def select(self, selector, _candidate_generator=None, limit=None):
- """Perform a CSS selection operation on the current element."""
+ def select(self, selector, namespaces=None, limit=None, **kwargs):
+ """Perform a CSS selection operation on the current element.
- # Handle grouping selectors if ',' exists, ie: p,a
- if ',' in selector:
- context = []
- for partial_selector in selector.split(','):
- partial_selector = partial_selector.strip()
- if partial_selector == '':
- raise ValueError('Invalid group selection syntax: %s' % selector)
- candidates = self.select(partial_selector, limit=limit)
- for candidate in candidates:
- if candidate not in context:
- context.append(candidate)
-
- if limit and len(context) >= limit:
- break
- return context
- tokens = shlex.split(selector)
- current_context = [self]
-
- if tokens[-1] in self._selector_combinators:
- raise ValueError(
- 'Final combinator "%s" is missing an argument.' % tokens[-1])
+ This uses the SoupSieve library.
- if self._select_debug:
- print 'Running CSS selector "%s"' % selector
+ :param selector: A string containing a CSS selector.
- for index, token in enumerate(tokens):
- new_context = []
- new_context_ids = set([])
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
- if tokens[index-1] in self._selector_combinators:
- # This token was consumed by the previous combinator. Skip it.
- if self._select_debug:
- print ' Token was consumed by the previous combinator.'
- continue
+ :param limit: After finding this number of results, stop looking.
- if self._select_debug:
- print ' Considering token "%s"' % token
- recursive_candidate_generator = None
- tag_name = None
-
- # Each operation corresponds to a checker function, a rule
- # for determining whether a candidate matches the
- # selector. Candidates are generated by the active
- # iterator.
- checker = None
-
- m = self.attribselect_re.match(token)
- if m is not None:
- # Attribute selector
- tag_name, attribute, operator, value = m.groups()
- checker = self._attribute_checker(operator, attribute, value)
-
- elif '#' in token:
- # ID selector
- tag_name, tag_id = token.split('#', 1)
- def id_matches(tag):
- return tag.get('id', None) == tag_id
- checker = id_matches
-
- elif '.' in token:
- # Class selector
- tag_name, klass = token.split('.', 1)
- classes = set(klass.split('.'))
- def classes_match(candidate):
- return classes.issubset(candidate.get('class', []))
- checker = classes_match
-
- elif ':' in token and not self.quoted_colon.search(token):
- # Pseudo-class
- tag_name, pseudo = token.split(':', 1)
- if tag_name == '':
- raise ValueError(
- "A pseudo-class must be prefixed with a tag name.")
- pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
- found = []
- if pseudo_attributes is None:
- pseudo_type = pseudo
- pseudo_value = None
- else:
- pseudo_type, pseudo_value = pseudo_attributes.groups()
- if pseudo_type == 'nth-of-type':
- try:
- pseudo_value = int(pseudo_value)
- except:
- raise NotImplementedError(
- 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
- if pseudo_value < 1:
- raise ValueError(
- 'nth-of-type pseudo-class value must be at least 1.')
- class Counter(object):
- def __init__(self, destination):
- self.count = 0
- self.destination = destination
-
- def nth_child_of_type(self, tag):
- self.count += 1
- if self.count == self.destination:
- return True
- else:
- return False
- checker = Counter(pseudo_value).nth_child_of_type
- else:
- raise NotImplementedError(
- 'Only the following pseudo-classes are implemented: nth-of-type.')
-
- elif token == '*':
- # Star selector -- matches everything
- pass
- elif token == '>':
- # Run the next token as a CSS selector against the
- # direct children of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.children
- elif token == '~':
- # Run the next token as a CSS selector against the
- # siblings of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.next_siblings
- elif token == '+':
- # For each tag in the current context, run the next
- # token as a CSS selector against the tag's next
- # sibling that's a tag.
- def next_tag_sibling(tag):
- yield tag.find_next_sibling(True)
- recursive_candidate_generator = next_tag_sibling
-
- elif self.tag_name_re.match(token):
- # Just a tag name.
- tag_name = token
- else:
- raise ValueError(
- 'Unsupported or invalid CSS selector: "%s"' % token)
- if recursive_candidate_generator:
- # This happens when the selector looks like "> foo".
- #
- # The generator calls select() recursively on every
- # member of the current context, passing in a different
- # candidate generator and a different selector.
- #
- # In the case of "> foo", the candidate generator is
- # one that yields a tag's direct children (">"), and
- # the selector is "foo".
- next_token = tokens[index+1]
- def recursive_select(tag):
- if self._select_debug:
- print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
- print '-' * 40
- for i in tag.select(next_token, recursive_candidate_generator):
- if self._select_debug:
- print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
- yield i
- if self._select_debug:
- print '-' * 40
- _use_candidate_generator = recursive_select
- elif _candidate_generator is None:
- # By default, a tag's candidates are all of its
- # children. If tag_name is defined, only yield tags
- # with that name.
- if self._select_debug:
- if tag_name:
- check = "[any]"
- else:
- check = tag_name
- print ' Default candidate generator, tag name="%s"' % check
- if self._select_debug:
- # This is redundant with later code, but it stops
- # a bunch of bogus tags from cluttering up the
- # debug log.
- def default_candidate_generator(tag):
- for child in tag.descendants:
- if not isinstance(child, Tag):
- continue
- if tag_name and not child.name == tag_name:
- continue
- yield child
- _use_candidate_generator = default_candidate_generator
- else:
- _use_candidate_generator = lambda tag: tag.descendants
- else:
- _use_candidate_generator = _candidate_generator
-
- count = 0
- for tag in current_context:
- if self._select_debug:
- print " Running candidate generator on %s %s" % (
- tag.name, repr(tag.attrs))
- for candidate in _use_candidate_generator(tag):
- if not isinstance(candidate, Tag):
- continue
- if tag_name and candidate.name != tag_name:
- continue
- if checker is not None:
- try:
- result = checker(candidate)
- except StopIteration:
- # The checker has decided we should no longer
- # run the generator.
- break
- if checker is None or result:
- if self._select_debug:
- print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
- if id(candidate) not in new_context_ids:
- # If a tag matches a selector more than once,
- # don't include it in the context more than once.
- new_context.append(candidate)
- new_context_ids.add(id(candidate))
- elif self._select_debug:
- print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
-
- current_context = new_context
- if limit and len(current_context) >= limit:
- current_context = current_context[:limit]
-
- if self._select_debug:
- print "Final verdict:"
- for i in current_context:
- print " %s %s" % (i.name, i.attrs)
- return current_context
+ :param kwargs: Any extra arguments you'd like to pass in to
+ soupsieve.select().
+ """
+ if namespaces is None:
+ namespaces = self._namespaces
+
+ if limit is None:
+ limit = 0
+ if soupsieve is None:
+ raise NotImplementedError(
+ "Cannot execute CSS selectors because the soupsieve package is not installed."
+ )
+
+ return soupsieve.select(selector, self, namespaces, limit, **kwargs)
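
A brief usage sketch of the soupsieve-backed API (requires the soupsieve package, per the check above; expected results assume standard CSS semantics):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<div id="main"><p class="story">Once <a href="/1">upon</a> a time</p></div>',
        "html.parser")
    soup.select_one("div#main > p.story a")   # <a href="/1">upon</a>
    len(soup.select("p, a", limit=2))         # 2 -- grouping is handled by soupsieve
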
# Old names for backwards compatibility
def childGenerator(self):
@@ -1603,7 +1374,7 @@ class SoupStrainer(object):
else:
attrs = kwargs
normalized_attrs = {}
- for key, value in attrs.items():
+ for key, value in list(attrs.items()):
normalized_attrs[key] = self._normalize_search_value(value)
self.attrs = normalized_attrs
@@ -1612,7 +1383,7 @@ class SoupStrainer(object):
def _normalize_search_value(self, value):
# Leave it alone if it's a Unicode string, a callable, a
# regular expression, a boolean, or None.
- if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
+ if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
or isinstance(value, bool) or value is None):
return value
@@ -1625,7 +1396,7 @@ class SoupStrainer(object):
new_value = []
for v in value:
if (hasattr(v, '__iter__') and not isinstance(v, bytes)
- and not isinstance(v, unicode)):
+ and not isinstance(v, str)):
# This is almost certainly the user's mistake. In the
# interests of avoiding infinite loops, we'll let
# it through as-is rather than doing a recursive call.
@@ -1637,7 +1408,7 @@ class SoupStrainer(object):
# Otherwise, convert it into a Unicode string.
# The unicode(str()) thing is so this will do the same thing on Python 2
# and Python 3.
- return unicode(str(value))
+ return str(str(value))
def __str__(self):
if self.text:
@@ -1652,7 +1423,7 @@ class SoupStrainer(object):
markup = markup_name
markup_attrs = markup
call_function_with_tag_data = (
- isinstance(self.name, collections.Callable)
+ isinstance(self.name, Callable)
and not isinstance(markup_name, Tag))
if ((not self.name)
@@ -1691,7 +1462,7 @@ class SoupStrainer(object):
found = None
# If given a list of items, scan it for a text element that
# matches.
- if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
+ if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
for element in markup:
if isinstance(element, NavigableString) \
and self.search(element):
@@ -1704,7 +1475,7 @@ class SoupStrainer(object):
found = self.search_tag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
- isinstance(markup, basestring):
+ isinstance(markup, str):
if not self.name and not self.attrs and self._matches(markup, self.text):
found = markup
else:
@@ -1732,7 +1503,7 @@ class SoupStrainer(object):
# True matches any non-None value.
return markup is not None
- if isinstance(match_against, collections.Callable):
+ if isinstance(match_against, Callable):
return match_against(markup)
# Custom callables take the tag as an argument, but all
@@ -1749,7 +1520,7 @@ class SoupStrainer(object):
return not match_against
if (hasattr(match_against, '__iter__')
- and not isinstance(match_against, basestring)):
+ and not isinstance(match_against, str)):
# We're asked to match against an iterable of items.
# The markup must be match at least one item in the
# iterable. We'll try each one in turn.
@@ -1776,7 +1547,7 @@ class SoupStrainer(object):
# the tag's name and once against its prefixed name.
match = False
- if not match and isinstance(match_against, unicode):
+ if not match and isinstance(match_against, str):
# Exact string match
match = markup == match_against
diff --git a/libs/bs4/testing.py b/libs/bs4/testing.py
index 6ba2506c4..cc9966601 100644
--- a/libs/bs4/testing.py
+++ b/libs/bs4/testing.py
@@ -1,7 +1,7 @@
+# encoding: utf-8
"""Helper classes for tests."""
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import pickle
@@ -16,29 +16,66 @@ from bs4.element import (
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
+ Tag
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
+BAD_DOCUMENT = """A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: &gt
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b> </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
+<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+"""
+
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
- return default_builder()
+ return default_builder
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
- def document_for(self, markup):
+ def document_for(self, markup, **kwargs):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
- return self.default_builder.test_fragment_to_document(markup)
+ return self.default_builder(**kwargs).test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
@@ -59,6 +96,121 @@ class SoupTest(unittest.TestCase):
self.assertEqual(earlier, e.previous_element)
earlier = e
+ def linkage_validator(self, el, _recursive_call=False):
+ """Ensure proper linkage throughout the document."""
+ descendant = None
+ # Document element should have no previous element or previous sibling.
+ # It also shouldn't have a next sibling.
+ if el.parent is None:
+ assert el.previous_element is None,\
+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ el, el.previous_element, None
+ )
+ assert el.previous_sibling is None,\
+ "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ el, el.previous_sibling, None
+ )
+ assert el.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+ el, el.next_sibling, None
+ )
+
+ idx = 0
+ child = None
+ last_child = None
+ last_idx = len(el.contents) - 1
+ for child in el.contents:
+ descendant = None
+
+            # A parent's next_element should be its first child, and that
+            # first child should have no previous sibling.
+ if idx == 0:
+ if el.parent is not None:
+ assert el.next_element is child,\
+ "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+ el, el.next_element, child
+ )
+ assert child.previous_element is el,\
+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ child, child.previous_element, el
+ )
+ assert child.previous_sibling is None,\
+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
+ child, child.previous_sibling, None
+ )
+
+            # If this is not the first child, the previous child should link to
+            # this one as its next sibling. This child's previous_element should
+            # be the previous child or the last descendant that bubbled up from it.
+ else:
+ assert child.previous_sibling is el.contents[idx - 1],\
+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
+ child, child.previous_sibling, el.contents[idx - 1]
+ )
+ assert el.contents[idx - 1].next_sibling is child,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
+ )
+
+ if last_child is not None:
+ assert child.previous_element is last_child,\
+ "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
+ child, child.previous_element, last_child, child.parent.contents
+ )
+ assert last_child.next_element is child,\
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ last_child, last_child.next_element, child
+ )
+
+ if isinstance(child, Tag) and child.contents:
+ descendant = self.linkage_validator(child, True)
+ # A bubbled up descendant should have no next siblings
+ assert descendant.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ descendant, descendant.next_sibling, None
+ )
+
+ # Mark last child as either the bubbled up descendant or the current child
+ if descendant is not None:
+ last_child = descendant
+ else:
+ last_child = child
+
+            # If this is the last child, it should have no next sibling.
+ if idx == last_idx:
+ assert child.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_sibling, None
+ )
+ idx += 1
+
+ child = descendant if descendant is not None else child
+ if child is None:
+ child = el
+
+ if not _recursive_call and child is not None:
+ target = el
+ while True:
+ if target is None:
+ assert child.next_element is None, \
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_element, None
+ )
+ break
+ elif target.next_sibling is not None:
+ assert child.next_element is target.next_sibling, \
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_element, target.next_sibling
+ )
+ break
+ target = target.parent
+
+ # We are done, so nothing to return
+ return None
+ else:
+ # Return the child to the recursive caller
+ return child
+
+
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
@@ -80,7 +232,7 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup("")
new_tag = soup.new_tag(name)
self.assertEqual(True, new_tag.is_empty_element)
-
+
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
@@ -150,12 +302,20 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
+ def test_namespaced_html(self):
+ """When a namespaced XML document is parsed as HTML it should
+ be treated as HTML with weird tag names.
+ """
+ markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
+ soup = self.soup(markup)
+ self.assertEqual(2, len(soup.find_all("ns1:foo")))
+
def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class
# even when the markup is already Unicode and there is no
# need to process anything.
- markup = u"""<?PITarget PIContent?>"""
+ markup = """<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.decode())
@@ -292,6 +452,18 @@ Hello, world!
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+ def test_multivalued_attribute_with_whitespace(self):
+ # Whitespace separating the values of a multi-valued attribute
+ # should be ignored.
+
+        markup = '<div class=" foo bar "></div>'
+ soup = self.soup(markup)
+ self.assertEqual(['foo', 'bar'], soup.div['class'])
+
+ # If you search by the literal name of the class it's like the whitespace
+ # wasn't there.
+ self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
+
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
@@ -311,15 +483,41 @@ Hello, world!
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
+ def test_strings_resembling_character_entity_references(self):
+ # "&T" and "&p" look like incomplete character entities, but they are
+ # not.
+ self.assertSoupEquals(
+ "<p>&bull; AT&T is in the s&p 500</p>",
+ "<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
+ )
+
+ def test_apos_entity(self):
+ self.assertSoupEquals(
+ "<p>Bob&apos;s Bar</p>",
+ "<p>Bob's Bar</p>",
+ )
+
+ def test_entities_in_foreign_document_encoding(self):
+ # &#147; and &#148; are invalid numeric entities referencing
+ # Windows-1252 characters. &#45; references a character common
+ # to Windows-1252 and Unicode, and &#9731; references a
+ # character only found in Unicode.
+ #
+ # All of these entities should be converted to Unicode
+ # characters.
+ markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
+ soup = self.soup(markup)
+ self.assertEqual("“Hello” -☃", soup.p.string)
+
def test_entities_in_attributes_converted_to_unicode(self):
- expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+ expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
- expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+ expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@@ -330,11 +528,11 @@ Hello, world!
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
- expect = u"\N{REPLACEMENT CHARACTER}"
+ expect = "\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
-
+
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
@@ -408,9 +606,9 @@ Hello, world!
# A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the
# encoding found in the declaration! The horror!
- markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+ markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup)
- self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+ self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
@@ -450,7 +648,7 @@ Hello, world!
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
- expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
+ expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
@@ -460,15 +658,15 @@ Hello, world!
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
- u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+ "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
- self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+ self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
- expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
+ expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
@@ -477,7 +675,7 @@ Hello, world!
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
- unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+ unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
@@ -586,6 +784,13 @@ Hello, world!
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
+ def test_worst_case(self):
+ """Test the worst case (currently) for linking issues."""
+
+ soup = self.soup(BAD_DOCUMENT)
+ self.linkage_validator(soup)
+
+
class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self):
@@ -624,6 +829,17 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual(
soup.encode("utf-8"), markup)
+ def test_nested_namespaces(self):
+ doc = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<parent xmlns="http://ns1/">
+<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
+<grandchild ns3:attr="value" xmlns="http://ns4/"/>
+</child>
+</parent>"""
+ soup = self.soup(doc)
+ self.assertEqual(doc, soup.encode())
+
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">
@@ -637,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object):
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_can_parse_unicode_document(self):
- markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+ markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
soup = self.soup(markup)
- self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+ self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
- unicode(soup.rss), markup)
+ str(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
@@ -676,17 +892,17 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.p), markup)
+ self.assertEqual(str(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.foo), markup)
+ self.assertEqual(str(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.foo), markup)
+ self.assertEqual(str(soup.foo), markup)
def test_find_by_prefixed_name(self):
doc = """<?xml version="1.0" encoding="utf-8"?>
@@ -721,6 +937,12 @@ class XMLTreeBuilderSmokeTest(object):
# The two tags have the same namespace prefix.
self.assertEqual(tag.prefix, duplicate.prefix)
+ def test_worst_case(self):
+ """Test the worst case (currently) for linking issues."""
+
+ soup = self.soup(BAD_DOCUMENT)
+ self.linkage_validator(soup)
+
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
diff --git a/libs/bs4/tests/test_html5lib.py b/libs/bs4/tests/test_html5lib.py
index 0f89d6244..96529b0b3 100644
--- a/libs/bs4/tests/test_html5lib.py
+++ b/libs/bs4/tests/test_html5lib.py
@@ -5,7 +5,7 @@ import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
-except ImportError, e:
+except ImportError as e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
@@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
@property
def default_builder(self):
- return HTML5TreeBuilder()
+ return HTML5TreeBuilder
def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers.
@@ -74,14 +74,14 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
def test_reparented_markup(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
soup = self.soup(markup)
- self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
+ self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_ends_with_whitespace(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
soup = self.soup(markup)
- self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
+ self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_containing_identical_whitespace_nodes(self):
@@ -127,4 +127,44 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
def test_foster_parenting(self):
markup = b"""<table><td></tbody>A"""
soup = self.soup(markup)
- self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
+ self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
+
+ def test_extraction(self):
+ """
+ Test that extraction does not destroy the tree.
+
+ https://bugs.launchpad.net/beautifulsoup/+bug/1782928
+ """
+
+ markup = """
+<html><head></head>
+<style>
+</style><script></script><body><p>hello</p></body></html>
+"""
+ soup = self.soup(markup)
+ [s.extract() for s in soup('script')]
+ [s.extract() for s in soup('style')]
+
+ self.assertEqual(len(soup.find_all("p")), 1)
+
+ def test_empty_comment(self):
+ """
+ Test that empty comment does not break structure.
+
+ https://bugs.launchpad.net/beautifulsoup/+bug/1806598
+ """
+
+ markup = """
+<html>
+<body>
+<form>
+<!----><input type="text">
+</form>
+</body>
+</html>
+"""
+ soup = self.soup(markup)
+ inputs = []
+ for form in soup.find_all('form'):
+ inputs.extend(form.find_all('input'))
+ self.assertEqual(len(inputs), 1)
diff --git a/libs/bs4/tests/test_htmlparser.py b/libs/bs4/tests/test_htmlparser.py
index d5cf0253f..790489aa1 100644
--- a/libs/bs4/tests/test_htmlparser.py
+++ b/libs/bs4/tests/test_htmlparser.py
@@ -5,12 +5,11 @@ from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
- @property
- def default_builder(self):
- return HTMLParserTreeBuilder()
+ default_builder = HTMLParserTreeBuilder
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
@@ -32,3 +31,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
self.assertSoupEquals('</br></br></br>', "")
+
+ def test_empty_element(self):
+ # This verifies that any buffered data present when the parser
+ # finishes working is handled.
+ self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
+
+
+class TestHTMLParserSubclass(SoupTest):
+ def test_error(self):
+ """Verify that our HTMLParser subclass implements error() in a way
+ that doesn't cause a crash.
+ """
+ parser = BeautifulSoupHTMLParser()
+ parser.error("don't crash")
diff --git a/libs/bs4/tests/test_lxml.py b/libs/bs4/tests/test_lxml.py
index a05870b91..29da71149 100644
--- a/libs/bs4/tests/test_lxml.py
+++ b/libs/bs4/tests/test_lxml.py
@@ -7,7 +7,7 @@ try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
-except ImportError, e:
+except ImportError as e:
LXML_PRESENT = False
LXML_VERSION = (0,)
@@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
- return LXMLTreeBuilder()
+ return LXMLTreeBuilder
def test_out_of_range_entity(self):
self.assertSoupEquals(
@@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
+ def test_entities_in_foreign_document_encoding(self):
+ # We can't implement this case correctly because by the time we
+ # hear about markup like "&#147;", it's been (incorrectly) converted into
+ # a string like u'\x93'
+ pass
+
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@@ -62,7 +68,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
# if one is installed.
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />")
- self.assertEqual(u"<b/>", unicode(soup.b))
+ self.assertEqual("<b/>", str(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
@skipIf(
@@ -73,4 +79,22 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
@property
def default_builder(self):
- return LXMLTreeBuilderForXML()
+ return LXMLTreeBuilderForXML
+
+ def test_namespace_indexing(self):
+ # We should not track un-prefixed namespaces as we can only hold one
+ # and it will be recognized as the default namespace by soupsieve,
+ # which may be confusing in some situations. When no namespace is provided
+ # for a selector, the default namespace (if defined) is assumed.
+
+ soup = self.soup(
+ '<?xml version="1.1"?>\n'
+ '<root>'
+ '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+ '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+ '</root>'
+ )
+ self.assertEqual(
+ soup._namespaces,
+ {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
+ )
diff --git a/libs/bs4/tests/test_soup.py b/libs/bs4/tests/test_soup.py
index f3e69edf3..1eda9484b 100644
--- a/libs/bs4/tests/test_soup.py
+++ b/libs/bs4/tests/test_soup.py
@@ -24,6 +24,7 @@ from bs4.dammit import (
EncodingDetector,
)
from bs4.testing import (
+ default_builder,
SoupTest,
skipIf,
)
@@ -32,7 +33,7 @@ import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
-except ImportError, e:
+except ImportError as e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
@@ -40,21 +41,86 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
- data = u"<h1>éé</h1>"
+ data = "<h1>éé</h1>"
soup = self.soup(data)
- self.assertEqual(u"éé", soup.h1.string)
+ self.assertEqual("éé", soup.h1.string)
def test_embedded_null(self):
- data = u"<h1>foo\0bar</h1>"
+ data = "<h1>foo\0bar</h1>"
soup = self.soup(data)
- self.assertEqual(u"foo\0bar", soup.h1.string)
+ self.assertEqual("foo\0bar", soup.h1.string)
def test_exclude_encodings(self):
- utf8_data = u"Räksmörgås".encode("utf-8")
+ utf8_data = "Räksmörgås".encode("utf-8")
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding)
-
+ def test_custom_builder_class(self):
+ # Verify that you can pass in a custom Builder class and
+ # it'll be instantiated with the appropriate keyword arguments.
+ class Mock(object):
+ def __init__(self, **kwargs):
+ self.called_with = kwargs
+ self.is_xml = True
+ def initialize_soup(self, soup):
+ pass
+ def prepare_markup(self, *args, **kwargs):
+ return ''
+
+ kwargs = dict(
+ var="value",
+ # This is a deprecated BS3-era keyword argument, which
+ # will be stripped out.
+ convertEntities=True,
+ )
+ with warnings.catch_warnings(record=True):
+ soup = BeautifulSoup('', builder=Mock, **kwargs)
+ assert isinstance(soup.builder, Mock)
+ self.assertEqual(dict(var="value"), soup.builder.called_with)
+
+ # You can also instantiate the TreeBuilder yourself. In this
+ # case, that specific object is used and any keyword arguments
+ # to the BeautifulSoup constructor are ignored.
+ builder = Mock(**kwargs)
+ with warnings.catch_warnings(record=True) as w:
+ soup = BeautifulSoup(
+ '', builder=builder, ignored_value=True,
+ )
+ msg = str(w[0].message)
+ assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
+ self.assertEqual(builder, soup.builder)
+ self.assertEqual(kwargs, builder.called_with)
+
+ def test_cdata_list_attributes(self):
+ # Most attribute values are represented as scalars, but the
+ # HTML standard says that some attributes, like 'class' have
+ # space-separated lists as values.
+ markup = '<a id=" an id " class=" a class "></a>'
+ soup = self.soup(markup)
+
+ # Note that the spaces are stripped for 'class' but not for 'id'.
+ a = soup.a
+ self.assertEqual(" an id ", a['id'])
+ self.assertEqual(["a", "class"], a['class'])
+
+        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
+ # you customize or disable this. As always, you can customize the TreeBuilder
+ # by passing in a keyword argument to the BeautifulSoup constructor.
+ soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
+ self.assertEqual(" a class ", soup.a['class'])
+
+ # Here are two ways of saying that `id` is a multi-valued
+ # attribute in this context, but 'class' is not.
+ for switcheroo in ({'*': 'id'}, {'a': 'id'}):
+ with warnings.catch_warnings(record=True) as w:
+ # This will create a warning about not explicitly
+ # specifying a parser, but we'll ignore it.
+ soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
+ a = soup.a
+ self.assertEqual(["an", "id"], a['id'])
+ self.assertEqual(" a class ", a['class'])
+
+
class TestWarnings(SoupTest):
def _no_parser_specified(self, s, is_there=True):
@@ -129,7 +195,7 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
- soup = self.soup(u"http://www.crummyunicode.com/")
+ soup = self.soup("http://www.crummyunicode.com/")
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
@@ -141,7 +207,7 @@ class TestWarnings(SoupTest):
def test_url_warning_with_unicode_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
- soup = self.soup(u"http://www.crummyuncode.com/ is great")
+ soup = self.soup("http://www.crummyuncode.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
@@ -163,9 +229,9 @@ class TestEntitySubstitution(unittest.TestCase):
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities
# are substituted, and no others.
- s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
+ s = "foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
- u"foo&forall;\N{SNOWMAN}&otilde;bar")
+ "foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
@@ -217,7 +283,7 @@ class TestEntitySubstitution(unittest.TestCase):
self.assertEqual(
self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
"&Aacute;T&amp;T")
-
+
def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"'
@@ -230,7 +296,7 @@ class TestEncodingConversion(SoupTest):
def setUp(self):
super(TestEncodingConversion, self).setUp()
- self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
+ self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
@@ -250,7 +316,7 @@ class TestEncodingConversion(SoupTest):
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
- self.assertTrue(isinstance(unicode_output, unicode))
+ self.assertTrue(isinstance(unicode_output, str))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally:
@@ -262,7 +328,7 @@ class TestEncodingConversion(SoupTest):
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
- self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
+ self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
@@ -270,7 +336,7 @@ class TestEncodingConversion(SoupTest):
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
- self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
+ self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
@@ -281,14 +347,14 @@ class TestEncodingConversion(SoupTest):
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
- markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
+ markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
- markup = u"I'm already Unicode! \N{SNOWMAN}"
+ markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
@@ -296,7 +362,7 @@ class TestUnicodeDammit(unittest.TestCase):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
- dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+ dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -320,14 +386,14 @@ class TestUnicodeDammit(unittest.TestCase):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
- self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
+ self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
- self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
+ self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@@ -336,19 +402,19 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
- utf8_data = u"Räksmörgås".encode("utf-8")
+ utf8_data = "Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
- utf8_data = u"Räksmörgås".encode("utf-8")
+ utf8_data = "Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
- utf8_data = u"Räksmörgås".encode("utf-8")
+ utf8_data = "Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
@@ -364,7 +430,7 @@ class TestUnicodeDammit(unittest.TestCase):
detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
encodings = list(detected.encodings)
- assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
+ assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self):
@@ -404,7 +470,7 @@ class TestUnicodeDammit(unittest.TestCase):
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
- self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+ self.assertTrue("\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
@@ -416,17 +482,17 @@ class TestUnicodeDammit(unittest.TestCase):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
- self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
+ self.assertEqual("<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
- utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
+ utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
- u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
- u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
+ "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
+ "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
@@ -441,7 +507,7 @@ class TestUnicodeDammit(unittest.TestCase):
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
- u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
+ "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
@@ -449,9 +515,9 @@ class TestUnicodeDammit(unittest.TestCase):
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
- u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
- u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
- u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
+ "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
+ "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
+ "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
diff --git a/libs/bs4/tests/test_tree.py b/libs/bs4/tests/test_tree.py
index c0e7c4080..3b4beeb8f 100644
--- a/libs/bs4/tests/test_tree.py
+++ b/libs/bs4/tests/test_tree.py
@@ -1,4 +1,3 @@
-
# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.
@@ -26,6 +25,7 @@ from bs4.element import (
Comment,
Declaration,
Doctype,
+ Formatter,
NavigableString,
SoupStrainer,
Tag,
@@ -71,13 +71,13 @@ class TestFind(TreeTest):
self.assertEqual(soup.find("b").string, "2")
def test_unicode_text_find(self):
- soup = self.soup(u'<h1>Räksmörgås</h1>')
- self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
+ soup = self.soup('<h1>Räksmörgås</h1>')
+ self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
def test_unicode_attribute_find(self):
- soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>')
+ soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
str(soup)
- self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text)
+ self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
def test_find_everything(self):
@@ -97,17 +97,17 @@ class TestFindAll(TreeTest):
"""You can search the tree for text nodes."""
soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
# Exact match.
- self.assertEqual(soup.find_all(string="bar"), [u"bar"])
- self.assertEqual(soup.find_all(text="bar"), [u"bar"])
+ self.assertEqual(soup.find_all(string="bar"), ["bar"])
+ self.assertEqual(soup.find_all(text="bar"), ["bar"])
# Match any of a number of strings.
self.assertEqual(
- soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
+ soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
# Match a regular expression.
self.assertEqual(soup.find_all(text=re.compile('.*')),
- [u"Foo", u"bar", u'\xbb'])
+ ["Foo", "bar", '\xbb'])
# Match anything.
self.assertEqual(soup.find_all(text=True),
- [u"Foo", u"bar", u'\xbb'])
+ ["Foo", "bar", '\xbb'])
def test_find_all_limit(self):
"""You can limit the number of items returned by find_all."""
@@ -250,8 +250,8 @@ class TestFindAllByAttribute(TreeTest):
["Matching a.", "Matching b."])
def test_find_all_by_utf8_attribute_value(self):
- peace = u"םולש".encode("utf8")
- data = u'<a title="םולש"></a>'.encode("utf8")
+ peace = "םולש".encode("utf8")
+ data = '<a title="םולש"></a>'.encode("utf8")
soup = self.soup(data)
self.assertEqual([soup.a], soup.find_all(title=peace))
self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@@ -417,6 +417,48 @@ class TestFindAllByAttribute(TreeTest):
self.assertEqual([], soup.find_all(id=1, text="bar"))
+class TestSmooth(TreeTest):
+ """Test Tag.smooth."""
+
+ def test_smooth(self):
+ soup = self.soup("<div>a</div>")
+ div = soup.div
+ div.append("b")
+ div.append("c")
+ div.append(Comment("Comment 1"))
+ div.append(Comment("Comment 2"))
+ div.append("d")
+ builder = self.default_builder()
+ span = Tag(soup, builder, 'span')
+ span.append('1')
+ span.append('2')
+ div.append(span)
+
+ # At this point the tree has a bunch of adjacent
+ # NavigableStrings. This is normal, but it has no meaning in
+ # terms of HTML, so we may want to smooth things out for
+ # output.
+
+ # Since the <span> tag has two children, its .string is None.
+ self.assertEqual(None, div.span.string)
+
+ self.assertEqual(7, len(div.contents))
+ div.smooth()
+ self.assertEqual(5, len(div.contents))
+
+ # The three strings at the beginning of div.contents have been
+ # merged into one string.
+ self.assertEqual('abc', div.contents[0])
+
+ # The call is recursive -- the <span> tag was also smoothed.
+ self.assertEqual('12', div.span.string)
+
+ # The two comments have _not_ been merged, even though
+ # comments are strings. Merging comments would change the
+ # meaning of the HTML.
+ self.assertEqual('Comment 1', div.contents[1])
+ self.assertEqual('Comment 2', div.contents[2])
class TestIndex(TreeTest):
@@ -605,7 +647,7 @@ class SiblingTest(TreeTest):
</html>'''
# All that whitespace looks good but makes the tests more
# difficult. Get rid of it.
- markup = re.compile("\n\s*").sub("", markup)
+ markup = re.compile(r"\n\s*").sub("", markup)
self.tree = self.soup(markup)
@@ -703,12 +745,12 @@ class TestTagCreation(SoupTest):
"""Test the ability to create new tags."""
def test_new_tag(self):
soup = self.soup("")
- new_tag = soup.new_tag("foo", bar="baz")
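+ # The attrs dict makes it possible to set attributes (like 'name')
+ # whose names collide with new_tag's own arguments.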
+ new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
self.assertTrue(isinstance(new_tag, Tag))
self.assertEqual("foo", new_tag.name)
- self.assertEqual(dict(bar="baz"), new_tag.attrs)
+ self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
self.assertEqual(None, new_tag.parent)
-
+
def test_tag_inherits_self_closing_rules_from_builder(self):
if XML_BUILDER_PRESENT:
xml_soup = BeautifulSoup("", "lxml-xml")
@@ -821,6 +863,26 @@ class TestTreeModification(SoupTest):
soup = self.soup(text)
self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
+ def test_insert_beautifulsoup_object_inserts_children(self):
+ """Inserting one BeautifulSoup object into another actually inserts all
+ of its children -- you'll never combine BeautifulSoup objects.
+ """
+ soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")
+
+ text = "<p>p2</p><p>p3</p>"
+ to_insert = self.soup(text)
+ soup.insert(1, to_insert)
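+ # Index 1 places the inserted soup's children between the two
+ # original <p> tags.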
+
+ for i in soup.descendants:
+ assert not isinstance(i, BeautifulSoup)
+
+ p1, p2, p3, p4 = list(soup.children)
+ self.assertEqual("And now, a word:", p1.string)
+ self.assertEqual("p2", p2.string)
+ self.assertEqual("p3", p3.string)
+ self.assertEqual("And we're back.", p4.string)
+
+
def test_replace_with_maintains_next_element_throughout(self):
soup = self.soup('<p><a>one</a><b>three</b></p>')
a = soup.a
@@ -877,7 +939,7 @@ class TestTreeModification(SoupTest):
self.assertEqual(soup.a.contents[0].next_element, "bar")
def test_insert_tag(self):
- builder = self.default_builder
+ builder = self.default_builder()
soup = self.soup(
"<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
magic_tag = Tag(soup, builder, 'magictag')
@@ -912,6 +974,13 @@ class TestTreeModification(SoupTest):
soup.a.append(soup.b)
self.assertEqual(data, soup.decode())
+ def test_extend(self):
+ data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>"
+ soup = self.soup(data)
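+ # extend() appends each element in turn; a tag that's already in
+ # the tree is moved rather than copied, so the nesting flattens
+ # into siblings.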
+ tags = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b]
+ soup.a.extend(tags)
+ self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())
+
def test_move_tag_to_beginning_of_parent(self):
data = "<a><b></b><c></c><d></d></a>"
soup = self.soup(data)
@@ -938,6 +1007,29 @@ class TestTreeModification(SoupTest):
self.assertEqual(
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
+ # Can't insert an element before itself.
+ b = soup.b
+ self.assertRaises(ValueError, b.insert_before, b)
+
+ # Can't insert before if an element has no parent.
+ b.extract()
+ self.assertRaises(ValueError, b.insert_before, "nope")
+
+ # Can insert an identical element
+ soup = self.soup("<a>")
+ soup.a.insert_before(soup.new_tag("a"))
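+ # The new tag is equal to soup.a but is a distinct object, so
+ # insertion is allowed.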
+
+ def test_insert_multiple_before(self):
+ soup = self.soup("<a>foo</a><b>bar</b>")
+ soup.b.insert_before("BAZ", " ", "QUUX")
+ soup.a.insert_before("QUUX", " ", "BAZ")
+ self.assertEqual(
+ soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>"))
+
+ soup.a.insert_before(soup.b, "FOO")
+ self.assertEqual(
+ soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX"))
+
def test_insert_after(self):
soup = self.soup("<a>foo</a><b>bar</b>")
soup.b.insert_after("BAZ")
@@ -948,6 +1040,28 @@ class TestTreeModification(SoupTest):
self.assertEqual(
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
+ # Can't insert an element after itself.
+ b = soup.b
+ self.assertRaises(ValueError, b.insert_after, b)
+
+ # Can't insert after if an element has no parent.
+ b.extract()
+ self.assertRaises(ValueError, b.insert_after, "nope")
+
+ # Can insert an identical element
+ soup = self.soup("<a>")
+ soup.a.insert_after(soup.new_tag("a"))
+
+ def test_insert_multiple_after(self):
+ soup = self.soup("<a>foo</a><b>bar</b>")
+ soup.b.insert_after("BAZ", " ", "QUUX")
+ soup.a.insert_after("QUUX", " ", "BAZ")
+ self.assertEqual(
+ soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX"))
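+ # Passing a tag that's already in the tree moves it rather than
+ # copying it.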
+ soup.b.insert_after(soup.a, "FOO ")
+ self.assertEqual(
+ soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX"))
+
def test_insert_after_raises_exception_if_after_has_no_meaning(self):
soup = self.soup("")
tag = soup.new_tag("a")
@@ -1111,7 +1225,7 @@ class TestTreeModification(SoupTest):
<script>baz</script>
</html>""")
[soup.script.extract() for i in soup.find_all("script")]
- self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body))
+ self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
@@ -1186,7 +1300,7 @@ class TestElementObjects(SoupTest):
tag = soup.bTag
self.assertEqual(soup.b, tag)
self.assertEqual(
- '.bTag is deprecated, use .find("b") instead.',
+ '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
str(w[0].message))
def test_has_attr(self):
@@ -1349,19 +1463,19 @@ class TestPersistence(SoupTest):
soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
encoding = soup.original_encoding
copy = soup.__copy__()
- self.assertEqual(u"<p> </p>", unicode(copy))
+ self.assertEqual("<p> </p>", str(copy))
self.assertEqual(encoding, copy.original_encoding)
def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled.
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.decode(), soup.decode())
def test_copy_navigablestring_is_not_attached_to_tree(self):
- html = u"<b>Foo<a></a></b><b>Bar</b>"
+ html = "<b>Foo<a></a></b><b>Bar</b>"
soup = self.soup(html)
s1 = soup.find(string="Foo")
s2 = copy.copy(s1)
@@ -1373,7 +1487,7 @@ class TestPersistence(SoupTest):
self.assertEqual(None, s2.previous_element)
def test_copy_navigablestring_subclass_has_same_type(self):
- html = u"<b><!--Foo--></b>"
+ html = "<b><!--Foo--></b>"
soup = self.soup(html)
s1 = soup.string
s2 = copy.copy(s1)
@@ -1381,19 +1495,19 @@ class TestPersistence(SoupTest):
self.assertTrue(isinstance(s2, Comment))
def test_copy_entire_soup(self):
- html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+ html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
soup = self.soup(html)
soup_copy = copy.copy(soup)
self.assertEqual(soup, soup_copy)
def test_copy_tag_copies_contents(self):
- html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+ html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
soup = self.soup(html)
div = soup.div
div_copy = copy.copy(div)
# The two tags look the same, and evaluate to equal.
- self.assertEqual(unicode(div), unicode(div_copy))
+ self.assertEqual(str(div), str(div_copy))
self.assertEqual(div, div_copy)
# But they're not the same object.
@@ -1409,67 +1523,75 @@ class TestPersistence(SoupTest):
class TestSubstitutions(SoupTest):
def test_default_formatter_is_minimal(self):
- markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="minimal")
# The < is converted back into &lt; but the e-with-acute is left alone.
self.assertEqual(
decoded,
self.document_for(
- u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+ "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_html(self):
- markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="html")
self.assertEqual(
decoded,
- self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+ self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+ def test_formatter_html5(self):
+ markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ soup = self.soup(markup)
+ decoded = soup.decode(formatter="html5")
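+ # The "html5" formatter is like "html", except void elements get no
+ # closing slash: <br>, not <br/>.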
+ self.assertEqual(
+ decoded,
+ self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+
def test_formatter_minimal(self):
- markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="minimal")
# The < is converted back into &lt; but the e-with-acute is left alone.
self.assertEqual(
decoded,
self.document_for(
- u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+ "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_null(self):
- markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter=None)
# Neither the angle brackets nor the e-with-acute are converted.
# This is not valid HTML, but it's what the user wanted.
self.assertEqual(decoded,
- self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
+ self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
def test_formatter_custom(self):
- markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
+ markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
soup = self.soup(markup)
decoded = soup.decode(formatter = lambda x: x.upper())
# Instead of the normal entity conversion code, the custom
# callable is called on every string.
self.assertEqual(
decoded,
- self.document_for(u"<b><FOO></b><b>BAR</b>"))
+ self.document_for("<b><FOO></b><b>BAR</b><br/>"))
def test_formatter_is_run_on_attribute_values(self):
- markup = u'<a href="http://a.com?a=b&c=é">e</a>'
+ markup = '<a href="http://a.com?a=b&c=é">e</a>'
soup = self.soup(markup)
a = soup.a
- expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
+ expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
self.assertEqual(expect_minimal, a.decode())
self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
- expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+ expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
self.assertEqual(expect_html, a.decode(formatter="html"))
self.assertEqual(markup, a.decode(formatter=None))
- expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
+ expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
def test_formatter_skips_script_tag_for_html_documents(self):
@@ -1491,28 +1613,28 @@ class TestSubstitutions(SoupTest):
self.assertTrue(b"< < hey > >" in encoded)
def test_prettify_leaves_preformatted_text_alone(self):
- soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
+ soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
# Everything outside the <pre> tag is reformatted, but everything
# inside is left alone.
self.assertEqual(
- u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
+ '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
soup.div.prettify())
- def test_prettify_accepts_formatter(self):
+ def test_prettify_accepts_formatter_function(self):
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
pretty = soup.prettify(formatter = lambda x: x.upper())
self.assertTrue("FOO" in pretty)
def test_prettify_outputs_unicode_by_default(self):
soup = self.soup("<a></a>")
- self.assertEqual(unicode, type(soup.prettify()))
+ self.assertEqual(str, type(soup.prettify()))
def test_prettify_can_encode_data(self):
soup = self.soup("<a></a>")
self.assertEqual(bytes, type(soup.prettify("utf-8")))
def test_html_entity_substitution_off_by_default(self):
- markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+ markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
soup = self.soup(markup)
encoded = soup.b.encode("utf-8")
self.assertEqual(encoded, markup.encode('utf-8'))
@@ -1556,54 +1678,77 @@ class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings."""
def test_unicode_string_can_be_encoded(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(soup.b.string.encode("utf-8"),
- u"\N{SNOWMAN}".encode("utf-8"))
+ "\N{SNOWMAN}".encode("utf-8"))
def test_tag_containing_unicode_string_can_be_encoded(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(
soup.b.encode("utf-8"), html.encode("utf-8"))
def test_encoding_substitutes_unrecognized_characters_by_default(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
def test_encoding_can_be_made_strict(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertRaises(
UnicodeEncodeError, soup.encode, "ascii", errors="strict")
def test_decode_contents(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
- self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())
+ self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
def test_encode_contents(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(
- u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
+ "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
encoding="utf8"))
def test_deprecated_renderContents(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(
- u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
+ "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
def test_repr(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
if PY3K:
self.assertEqual(html, repr(soup))
else:
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
+class TestFormatter(SoupTest):
+
+ def test_sort_attributes(self):
+ # Test the ability to override Formatter.attributes() to,
+ # e.g., filter out certain attributes or control how they're sorted.
+ class UnsortedFormatter(Formatter):
+ def attributes(self, tag):
+ self.called_with = tag
+ for k, v in sorted(tag.attrs.items()):
+ if k == 'ignore':
+ continue
+ yield k, v
+
+ soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
+ formatter = UnsortedFormatter()
+ decoded = soup.decode(formatter=formatter)
+
+ # attributes() was called on the <p> tag. It filtered out one
+ # attribute and sorted the other two.
+ self.assertEqual(formatter.called_with, soup.p)
+ self.assertEqual('<p aval="2" cval="1"></p>', decoded)
+
+
class TestNavigableStringSubclasses(SoupTest):
def test_cdata(self):
@@ -1720,7 +1865,7 @@ class TestSoupSelector(TreeTest):
els = self.soup.select('title')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].name, 'title')
- self.assertEqual(els[0].contents, [u'The title'])
+ self.assertEqual(els[0].contents, ['The title'])
def test_one_tag_many(self):
els = self.soup.select('div')
@@ -1755,7 +1900,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self):
- self.assertRaises(ValueError, self.soup.select, 'tag%t')
+ self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
def test_select_dashed_tag_ids(self):
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@@ -1766,7 +1911,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(dashed[0]['id'], 'dash2')
def test_dashed_tag_text(self):
- self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.')
+ self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
def test_select_dashed_matches_find_all(self):
self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
@@ -1946,32 +2091,31 @@ class TestSoupSelector(TreeTest):
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
self.assertRaises(
- NotImplementedError, self.soup.select, "a:nth-of-type(a)")
-
+ SyntaxError, self.soup.select, "a:nth-of-type(a)")
def test_nth_of_type(self):
# Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)')
self.assertEqual(len(els), 1)
- self.assertEqual(els[0].string, u'Some text')
+ self.assertEqual(els[0].string, 'Some text')
# Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)')
self.assertEqual(len(els), 1)
- self.assertEqual(els[0].string, u'Another')
+ self.assertEqual(els[0].string, 'Another')
# Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)')
self.assertEqual(len(els), 0)
- # Pass in an invalid value.
- self.assertRaises(
- ValueError, self.soup.select, 'div p:nth-of-type(0)')
+ # Zero will select no tags.
+ els = self.soup.select('div p:nth-of-type(0)')
+ self.assertEqual(len(els), 0)
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
self.assertEqual(len(els), 1)
- self.assertEqual(els[0].string, u'Some text')
+ self.assertEqual(els[0].string, 'Some text')
def test_id_child_selector_nth_of_type(self):
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
@@ -2003,7 +2147,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self):
- self.assertRaises(ValueError, self.soup.select, 'h1 >')
+ self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@@ -2034,8 +2178,8 @@ class TestSoupSelector(TreeTest):
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self):
- self.assertRaises(ValueError, self.soup.select, ',x, y')
- self.assertRaises(ValueError, self.soup.select, 'x,,y')
+ self.assertRaises(SyntaxError, self.soup.select, ',x, y')
+ self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
def test_multiple_select_attrs(self):
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
@@ -2046,5 +2190,16 @@ class TestSoupSelector(TreeTest):
def test_multiple_select_nested(self):
self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
-
-
+ def test_select_duplicate_elements(self):
+ # When markup contains duplicate elements, a multiple select
+ # will find all of them.
+ markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
+ soup = BeautifulSoup(markup, 'html.parser')
+ selected = soup.select(".c1, .c2")
+ self.assertEqual(3, len(selected))
+
+ # Verify that find_all finds the same elements, though because
+ # of an implementation detail it finds them in a different
+ # order.
+ for element in soup.find_all(class_=['c1', 'c2']):
+ assert element in selected