diff options
author    Louis Vézina <[email protected]>  2019-09-13 15:12:26 -0400
committer Louis Vézina <[email protected]>  2019-09-13 15:12:26 -0400
commit    645952c61aba3cccb5ca919be966a7ba02d853fa (patch)
tree      005362d11c260b4293b427ed8ba9d94e8b478ec3 /libs/bs4
parent    4e7e3a39d26cb3771876b43c82a747f6c6526dda (diff)
download  bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.tar.gz
          bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.zip
WIP
Diffstat (limited to 'libs/bs4')
-rw-r--r--  libs/bs4/AUTHORS.txt                  43
-rw-r--r--  libs/bs4/COPYING.txt                  27
-rw-r--r--  libs/bs4/NEWS.txt                   1190
-rw-r--r--  libs/bs4/README.txt                   63
-rw-r--r--  libs/bs4/TODO.txt                     31
-rw-r--r--  libs/bs4/__init__.py                 267
-rw-r--r--  libs/bs4/builder/__init__.py          74
-rw-r--r--  libs/bs4/builder/_html5lib.py         40
-rw-r--r--  libs/bs4/builder/_htmlparser.py       72
-rw-r--r--  libs/bs4/builder/_lxml.py             90
-rw-r--r--  libs/bs4/dammit.py                    40
-rw-r--r--  libs/bs4/diagnose.py                  77
-rw-r--r--  libs/bs4/element.py                  859
-rw-r--r--  libs/bs4/testing.py                  270
-rw-r--r--  libs/bs4/tests/test_html5lib.py       50
-rw-r--r--  libs/bs4/tests/test_htmlparser.py     19
-rw-r--r--  libs/bs4/tests/test_lxml.py           32
-rw-r--r--  libs/bs4/tests/test_soup.py          134
-rw-r--r--  libs/bs4/tests/test_tree.py          301
19 files changed, 1412 insertions, 2267 deletions
diff --git a/libs/bs4/AUTHORS.txt b/libs/bs4/AUTHORS.txt
deleted file mode 100644
index 2ac8fcc8c..000000000
--- a/libs/bs4/AUTHORS.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-Behold, mortal, the origins of Beautiful Soup...
-================================================
-
-Leonard Richardson is the primary programmer.
-
-Aaron DeVore is awesome.
-
-Mark Pilgrim provided the encoding detection code that forms the base
-of UnicodeDammit.
-
-Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
-Soup 4 working under Python 3.
-
-Simon Willison wrote soupselect, which was used to make Beautiful Soup
-support CSS selectors.
-
-Sam Ruby helped with a lot of edge cases.
-
-Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his
-work in solving the nestable tags conundrum.
-
-An incomplete list of people have contributed patches to Beautiful
-Soup:
-
- Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
- Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
- Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
- Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
- Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
- Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
- Webster, Paul Wright, Danny Yoo
-
-An incomplete list of people who made suggestions or found bugs or
-found ways to break Beautiful Soup:
-
- Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
- Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
- Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
- warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
- Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
- Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
- Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
- Sousa Rocha, Yichun Wei, Per Vognsen
diff --git a/libs/bs4/COPYING.txt b/libs/bs4/COPYING.txt
deleted file mode 100644
index b91188869..000000000
--- a/libs/bs4/COPYING.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Beautiful Soup is made available under the MIT license:
-
-    Copyright (c) 2004-2015 Leonard Richardson
-
-    Permission is hereby granted, free of charge, to any person obtaining
-    a copy of this software and associated documentation files (the
-    "Software"), to deal in the Software without restriction, including
-    without limitation the rights to use, copy, modify, merge, publish,
-    distribute, sublicense, and/or sell copies of the Software, and to
-    permit persons to whom the Software is furnished to do so, subject to
-    the following conditions:
-
-    The above copyright notice and this permission notice shall be
-    included in all copies or substantial portions of the Software.
-
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-    SOFTWARE.
-
-Beautiful Soup incorporates code from the html5lib library, which is
-also made available under the MIT license. Copyright (c) 2006-2013
-James Graham and other contributors
diff --git a/libs/bs4/NEWS.txt b/libs/bs4/NEWS.txt
deleted file mode 100644
index 3726c570a..000000000
--- a/libs/bs4/NEWS.txt
+++ /dev/null
@@ -1,1190 +0,0 @@
-= 4.4.1 (20150928) =
-
-* Fixed a bug that deranged the tree when part of it was
-  removed. Thanks to Eric Weiser for the patch and John Wiseman for a
-  test. [bug=1481520]
-
-* Fixed a parse bug with the html5lib tree-builder. Thanks to Roel
-  Kramer for the patch. [bug=1483781]
-
-* Improved the implementation of CSS selector grouping. Thanks to
-  Orangain for the patch. [bug=1484543]
-
-* Fixed the test_detect_utf8 test so that it works when chardet is
-  installed. [bug=1471359]
-
-* Corrected the output of Declaration objects. [bug=1477847]
-
-
-= 4.4.0 (20150703) =
-
-Especially important changes:
-
-* Added a warning when you instantiate a BeautifulSoup object without
-  explicitly naming a parser. [bug=1398866]
-
-* __repr__ now returns an ASCII bytestring in Python 2, and a Unicode
-  string in Python 3, instead of a UTF8-encoded bytestring in both
-  versions. In Python 3, __str__ now returns a Unicode string instead
-  of a bytestring. [bug=1420131]
-
-* The `text` argument to the find_* methods is now called `string`,
-  which is more accurate. `text` still works, but `string` is the
-  argument described in the documentation. `text` may eventually
-  change its meaning, but not for a very long time. [bug=1366856]
-
-* Changed the way soup objects work under copy.copy(). Copying a
-  NavigableString or a Tag will give you a new NavigableString that's
-  equal to the old one but not connected to the parse tree. Patch by
-  Martijn Peters. [bug=1307490]
-
-* Started using a standard MIT license. [bug=1294662]
-
-* Added a Chinese translation of the documentation by Delong .w.
-
-New features:
-
-* Introduced the select_one() method, which uses a CSS selector but
-  only returns the first match, instead of a list of
-  matches. [bug=1349367]
-
-* You can now create a Tag object without specifying a
-  TreeBuilder. Patch by Martijn Pieters. [bug=1307471]
-
-* You can now create a NavigableString or a subclass just by invoking
-  the constructor. [bug=1294315]
-
-* Added an `exclude_encodings` argument to UnicodeDammit and to the
-  Beautiful Soup constructor, which lets you prohibit the detection of
-  an encoding that you know is wrong. [bug=1469408]
-
-* The select() method now supports selector grouping. Patch by
-  Francisco Canas [bug=1191917]
-
-Bug fixes:
-
-* Fixed yet another problem that caused the html5lib tree builder to
-  create a disconnected parse tree. [bug=1237763]
-
-* Force object_was_parsed() to keep the tree intact even when an element
-  from later in the document is moved into place. [bug=1430633]
-
-* Fixed yet another bug that caused a disconnected tree when html5lib
-  copied an element from one part of the tree to another. [bug=1270611]
-
-* Fixed a bug where Element.extract() could create an infinite loop in
-  the remaining tree.
-
-* The select() method can now find tags whose names contain
-  dashes. Patch by Francisco Canas. [bug=1276211]
-
-* The select() method can now find tags with attributes whose names
-  contain dashes. Patch by Marek Kapolka. [bug=1304007]
-
-* Improved the lxml tree builder's handling of processing
-  instructions. [bug=1294645]
-
-* Restored the helpful syntax error that happens when you try to
-  import the Python 2 edition of Beautiful Soup under Python
-  3. [bug=1213387]
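
An aside on the 4.4.x entries above, sketched in Python: naming a parser explicitly, select_one(), and the `string` alias for `text`. The markup and variable names are invented for illustration; any bs4 >= 4.4 should behave this way.

    from bs4 import BeautifulSoup

    # Naming a parser explicitly ("html.parser" ships with Python) avoids
    # the no-parser warning described above and keeps results stable
    # across systems and virtual environments.
    soup = BeautifulSoup("<div><p class='a'>One</p><p class='a'>Two</p></div>",
                         "html.parser")

    # select_one() returns the first CSS-selector match instead of a list.
    print(soup.select_one("p.a").get_text())    # One

    # `string` is the documented alias for the old `text` argument.
    print(soup.find("p", string="Two"))         # <p class="a">Two</p>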
-
-* In Python 3.4 and above, set the new convert_charrefs argument to
-  the html.parser constructor to avoid a warning and future
-  failures. Patch by Stefano Revera. [bug=1375721]
-
-* The warning when you pass in a filename or URL as markup will now be
-  displayed correctly even if the filename or URL is a Unicode
-  string. [bug=1268888]
-
-* If the initial <html> tag contains a CDATA list attribute such as
-  'class', the html5lib tree builder will now turn its value into a
-  list, as it would with any other tag. [bug=1296481]
-
-* Fixed an import error in Python 3.5 caused by the removal of the
-  HTMLParseError class. [bug=1420063]
-
-* Improved docstring for encode_contents() and
-  decode_contents(). [bug=1441543]
-
-* Fixed a crash in Unicode, Dammit's encoding detector when the name
-  of the encoding itself contained invalid bytes. [bug=1360913]
-
-* Improved the exception raised when you call .unwrap() or
-  .replace_with() on an element that's not attached to a tree.
-
-* Raise a NotImplementedError whenever an unsupported CSS pseudoclass
-  is used in select(). Previously some cases did not result in a
-  NotImplementedError.
-
-* It's now possible to pickle a BeautifulSoup object no matter which
-  tree builder was used to create it. However, the only tree builder
-  that survives the pickling process is the HTMLParserTreeBuilder
-  ('html.parser'). If you unpickle a BeautifulSoup object created with
-  some other tree builder, soup.builder will be None. [bug=1231545]
-
-= 4.3.2 (20131002) =
-
-* Fixed a bug in which short Unicode input was improperly encoded to
-  ASCII when checking whether or not it was the name of a file on
-  disk. [bug=1227016]
-
-* Fixed a crash when a short input contains data not valid in
-  filenames. [bug=1232604]
-
-* Fixed a bug that caused Unicode data put into UnicodeDammit to
-  return None instead of the original data. [bug=1214983]
-
-* Combined two tests to stop a spurious test failure when tests are
-  run by nosetests. [bug=1212445]
-
-= 4.3.1 (20130815) =
-
-* Fixed yet another problem with the html5lib tree builder, caused by
-  html5lib's tendency to rearrange the tree during
-  parsing. [bug=1189267]
-
-* Fixed a bug that caused the optimized version of find_all() to
-  return nothing. [bug=1212655]
-
-= 4.3.0 (20130812) =
-
-* Instead of converting incoming data to Unicode and feeding it to the
-  lxml tree builder in chunks, Beautiful Soup now makes successive
-  guesses at the encoding of the incoming data, and tells lxml to
-  parse the data as that encoding. Giving lxml more control over the
-  parsing process improves performance and avoids a number of bugs and
-  issues with the lxml parser which had previously required elaborate
-  workarounds:
-
-  - An issue in which lxml refuses to parse Unicode strings on some
-    systems. [bug=1180527]
-
-  - A returning bug that truncated documents longer than a (very
-    small) size. [bug=963880]
-
-  - A returning bug in which extra spaces were added to a document if
-    the document defined a charset other than UTF-8. [bug=972466]
-
-  This required a major overhaul of the tree builder architecture. If
-  you wrote your own tree builder and didn't tell me, you'll need to
-  modify your prepare_markup() method.
-
-* The UnicodeDammit code that makes guesses at encodings has been
-  split into its own class, EncodingDetector. A lot of apparently
-  redundant code has been removed from Unicode, Dammit, and some
-  undocumented features have also been removed.
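
A quick sketch of the encoding guesswork mentioned above (UnicodeDammit, whose detection logic now lives in the EncodingDetector class). The byte string is invented for illustration:

    from bs4.dammit import UnicodeDammit

    # Give UnicodeDammit raw bytes plus a list of encodings to try first;
    # it reports its guess and the decoded text.
    dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["windows-1252"])
    print(dammit.original_encoding)   # windows-1252
    print(dammit.unicode_markup)      # Sacré bleu!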
-
-* Beautiful Soup will issue a warning if instead of markup you pass it
-  a URL or the name of a file on disk (a common beginner's mistake).
-
-* A number of optimizations improve the performance of the lxml tree
-  builder by about 33%, the html.parser tree builder by about 20%, and
-  the html5lib tree builder by about 15%.
-
-* All find_all calls should now return a ResultSet object. Patch by
-  Aaron DeVore. [bug=1194034]
-
-= 4.2.1 (20130531) =
-
-* The default XML formatter will now replace ampersands even if they
-  appear to be part of entities. That is, "<" will become
-  "&lt;". The old code was left over from Beautiful Soup 3, which
-  didn't always turn entities into Unicode characters.
-
-  If you really want the old behavior (maybe because you add new
-  strings to the tree, those strings include entities, and you want
-  the formatter to leave them alone on output), it can be found in
-  EntitySubstitution.substitute_xml_containing_entities(). [bug=1182183]
-
-* Gave new_string() the ability to create subclasses of
-  NavigableString. [bug=1181986]
-
-* Fixed another bug by which the html5lib tree builder could create a
-  disconnected tree. [bug=1182089]
-
-* The .previous_element of a BeautifulSoup object is now always None,
-  not the last element to be parsed. [bug=1182089]
-
-* Fixed test failures when lxml is not installed. [bug=1181589]
-
-* html5lib now supports Python 3. Fixed some Python 2-specific
-  code in the html5lib test suite. [bug=1181624]
-
-* The html.parser treebuilder can now handle numeric attributes in
-  text when the hexidecimal name of the attribute starts with a
-  capital X. Patch by Tim Shirley. [bug=1186242]
-
-= 4.2.0 (20130514) =
-
-* The Tag.select() method now supports a much wider variety of CSS
-  selectors.
-
-  - Added support for the adjacent sibling combinator (+) and the
-    general sibling combinator (~). Tests by "liquider". [bug=1082144]
-
-  - The combinators (>, +, and ~) can now combine with any supported
-    selector, not just one that selects based on tag name.
-
-  - Added limited support for the "nth-of-type" pseudo-class. Code
-    by Sven Slootweg. [bug=1109952]
-
-* The BeautifulSoup class is now aliased to "_s" and "_soup", making
-  it quicker to type the import statement in an interactive session:
-
-  from bs4 import _s
-  or
-  from bs4 import _soup
-
-  The alias may change in the future, so don't use this in code you're
-  going to run more than once.
-
-* Added the 'diagnose' submodule, which includes several useful
-  functions for reporting problems and doing tech support.
-
-  - diagnose(data) tries the given markup on every installed parser,
-    reporting exceptions and displaying successes. If a parser is not
-    installed, diagnose() mentions this fact.
-
-  - lxml_trace(data, html=True) runs the given markup through lxml's
-    XML parser or HTML parser, and prints out the parser events as
-    they happen. This helps you quickly determine whether a given
-    problem occurs in lxml code or Beautiful Soup code.
-
-  - htmlparser_trace(data) is the same thing, but for Python's
-    built-in HTMLParser class.
-
-* In an HTML document, the contents of a <script> or <style> tag will
-  no longer undergo entity substitution by default. XML documents work
-  the same way they did before. [bug=1085953]
-
-* Methods like get_text() and properties like .strings now only give
-  you strings that are visible in the document--no comments or
-  processing commands. [bug=1050164]
-
-* The prettify() method now leaves the contents of <pre> tags
-  alone. [bug=1095654]
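-
The diagnose submodule described in the 4.2.0 entry above can be exercised like this (the markup is an invented example):

    from bs4.diagnose import diagnose

    # Tries the markup on every installed parser and reports how each fares.
    diagnose("<p>Some<b>bad<i>HTML")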
-
-* Fix a bug in the html5lib treebuilder which sometimes created
-  disconnected trees. [bug=1039527]
-
-* Fix a bug in the lxml treebuilder which crashed when a tag included
-  an attribute from the predefined "xml:" namespace. [bug=1065617]
-
-* Fix a bug by which keyword arguments to find_parent() were not
-  being passed on. [bug=1126734]
-
-* Stop a crash when unwisely messing with a tag that's been
-  decomposed. [bug=1097699]
-
-* Now that lxml's segfault on invalid doctype has been fixed, fixed a
-  corresponding problem on the Beautiful Soup end that was previously
-  invisible. [bug=984936]
-
-* Fixed an exception when an overspecified CSS selector didn't match
-  anything. Code by Stefaan Lippens. [bug=1168167]
-
-= 4.1.3 (20120820) =
-
-* Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious
-  test failure caused by the lousy HTMLParser in those
-  versions. [bug=1038503]
-
-* Raise a more specific error (FeatureNotFound) when a requested
-  parser or parser feature is not installed. Raise NotImplementedError
-  instead of ValueError when the user calls insert_before() or
-  insert_after() on the BeautifulSoup object itself. Patch by Aaron
-  Devore. [bug=1038301]
-
-= 4.1.2 (20120817) =
-
-* As per PEP-8, allow searching by CSS class using the 'class_'
-  keyword argument. [bug=1037624]
-
-* Display namespace prefixes for namespaced attribute names, instead of
-  the fully-qualified names given by the lxml parser. [bug=1037597]
-
-* Fixed a crash on encoding when an attribute name contained
-  non-ASCII characters.
-
-* When sniffing encodings, if the cchardet library is installed,
-  Beautiful Soup uses it instead of chardet. cchardet is much
-  faster. [bug=1020748]
-
-* Use logging.warning() instead of warning.warn() to notify the user
-  that characters were replaced with REPLACEMENT
-  CHARACTER. [bug=1013862]
-
-= 4.1.1 (20120703) =
-
-* Fixed an html5lib tree builder crash which happened when html5lib
-  moved a tag with a multivalued attribute from one part of the tree
-  to another. [bug=1019603]
-
-* Correctly display closing tags with an XML namespace declared. Patch
-  by Andreas Kostyrka. [bug=1019635]
-
-* Fixed a typo that made parsing significantly slower than it should
-  have been, and also waited too long to close tags with XML
-  namespaces. [bug=1020268]
-
-* get_text() now returns an empty Unicode string if there is no text,
-  rather than an empty bytestring. [bug=1020387]
-
-= 4.1.0 (20120529) =
-
-* Added experimental support for fixing Windows-1252 characters
-  embedded in UTF-8 documents. (UnicodeDammit.detwingle())
-
-* Fixed the handling of " with the built-in parser. [bug=993871]
-
-* Comments, processing instructions, document type declarations, and
-  markup declarations are now treated as preformatted strings, the way
-  CData blocks are. [bug=1001025]
-
-* Fixed a bug with the lxml treebuilder that prevented the user from
-  adding attributes to a tag that didn't originally have
-  attributes. [bug=1002378] Thanks to Oliver Beattie for the patch.
-
-* Fixed some edge-case bugs having to do with inserting an element
-  into a tag it's already inside, and replacing one of a tag's
-  children with another. [bug=997529]
-
-* Added the ability to search for attribute values specified in UTF-8. [bug=1003974]
-
-  This caused a major refactoring of the search code. All the tests
-  pass, but it's possible that some searches will behave differently.
-
-= 4.0.5 (20120427) =
-
-* Added a new method, wrap(), which wraps an element in a tag.
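
A sketch combining the class_ keyword from the 4.1.2 entry and the wrap() method from the 4.0.5 entry above; unwrap(), its inverse, appears in the entries that follow. Markup invented for illustration:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p class="title">Hello</p>', "html.parser")

    # class_ (4.1.2 above) sidesteps the 'class' keyword clash.
    p = soup.find("p", class_="title")

    # wrap() (4.0.5 above) wraps an element in a new tag;
    # unwrap() (next entries below) is its inverse.
    p.string.wrap(soup.new_tag("b"))
    print(soup)       # <p class="title"><b>Hello</b></p>
    soup.b.unwrap()
    print(soup)       # <p class="title">Hello</p>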
-
-* Renamed replace_with_children() to unwrap(), which is easier to
-  understand and also the jQuery name of the function.
-
-* Made encoding substitution in <meta> tags completely transparent (no
-  more %SOUP-ENCODING%).
-
-* Fixed a bug in decoding data that contained a byte-order mark, such
-  as data encoded in UTF-16LE. [bug=988980]
-
-* Fixed a bug that made the HTMLParser treebuilder generate XML
-  definitions ending with two question marks instead of
-  one. [bug=984258]
-
-* Upon document generation, CData objects are no longer run through
-  the formatter. [bug=988905]
-
-* The test suite now passes when lxml is not installed, whether or not
-  html5lib is installed. [bug=987004]
-
-* Print a warning on HTMLParseErrors to let people know they should
-  install a better parser library.
-
-= 4.0.4 (20120416) =
-
-* Fixed a bug that sometimes created disconnected trees.
-
-* Fixed a bug with the string setter that moved a string around the
-  tree instead of copying it. [bug=983050]
-
-* Attribute values are now run through the provided output formatter.
-  Previously they were always run through the 'minimal' formatter. In
-  the future I may make it possible to specify different formatters
-  for attribute values and strings, but for now, consistent behavior
-  is better than inconsistent behavior. [bug=980237]
-
-* Added the missing renderContents method from Beautiful Soup 3. Also
-  added an encode_contents() method to go along with decode_contents().
-
-* Give a more useful error when the user tries to run the Python 2
-  version of BS under Python 3.
-
-* UnicodeDammit can now convert Microsoft smart quotes to ASCII with
-  UnicodeDammit(markup, smart_quotes_to="ascii").
-
-= 4.0.3 (20120403) =
-
-* Fixed a typo that caused some versions of Python 3 to convert the
-  Beautiful Soup codebase incorrectly.
-
-* Got rid of the 4.0.2 workaround for HTML documents--it was
-  unnecessary and the workaround was triggering a (possibly different,
-  but related) bug in lxml. [bug=972466]
-
-= 4.0.2 (20120326) =
-
-* Worked around a possible bug in lxml that prevents non-tiny XML
-  documents from being parsed. [bug=963880, bug=963936]
-
-* Fixed a bug where specifying `text` while also searching for a tag
-  only worked if `text` wanted an exact string match. [bug=955942]
-
-= 4.0.1 (20120314) =
-
-* This is the first official release of Beautiful Soup 4. There is no
-  4.0.0 release, to eliminate any possibility that packaging software
-  might treat "4.0.0" as being an earlier version than "4.0.0b10".
-
-* Brought BS up to date with the latest release of soupselect, adding
-  CSS selector support for direct descendant matches and multiple CSS
-  class matches.
-
-= 4.0.0b10 (20120302) =
-
-* Added support for simple CSS selectors, taken from the soupselect project.
-
-* Fixed a crash when using html5lib. [bug=943246]
-
-* In HTML5-style <meta charset="foo"> tags, the value of the "charset"
-  attribute is now replaced with the appropriate encoding on
-  output. [bug=942714]
-
-* Fixed a bug that caused calling a tag to sometimes call find_all()
-  with the wrong arguments. [bug=944426]
-
-* For backwards compatibility, brought back the BeautifulStoneSoup
-  class as a deprecated wrapper around BeautifulSoup.
-
-= 4.0.0b9 (20120228) =
-
-* Fixed the string representation of DOCTYPEs that have both a public
-  ID and a system ID.
-
-* Fixed the generated XML declaration.
-
-* Renamed Tag.nsprefix to Tag.prefix, for consistency with
-  NamespacedAttribute.
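
The smart-quote conversion from the 4.0.4 entry above, sketched with invented windows-1252 bytes:

    from bs4.dammit import UnicodeDammit

    markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"
    dammit = UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii")
    print(dammit.unicode_markup)
    # <p>I just "love" Microsoft Word's smart quotes</p>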
-
-* Fixed a test failure that occured on Python 3.x when chardet was
-  installed.
-
-* Made prettify() return Unicode by default, so it will look nice on
-  Python 3 when passed into print().
-
-= 4.0.0b8 (20120224) =
-
-* All tree builders now preserve namespace information in the
-  documents they parse. If you use the html5lib parser or lxml's XML
-  parser, you can access the namespace URL for a tag as tag.namespace.
-
-  However, there is no special support for namespace-oriented
-  searching or tree manipulation. When you search the tree, you need
-  to use namespace prefixes exactly as they're used in the original
-  document.
-
-* The string representation of a DOCTYPE always ends in a newline.
-
-* Issue a warning if the user tries to use a SoupStrainer in
-  conjunction with the html5lib tree builder, which doesn't support
-  them.
-
-= 4.0.0b7 (20120223) =
-
-* Upon decoding to string, any characters that can't be represented in
-  your chosen encoding will be converted into numeric XML entity
-  references.
-
-* Issue a warning if characters were replaced with REPLACEMENT
-  CHARACTER during Unicode conversion.
-
-* Restored compatibility with Python 2.6.
-
-* The install process no longer installs docs or auxillary text files.
-
-* It's now possible to deepcopy a BeautifulSoup object created with
-  Python's built-in HTML parser.
-
-* About 100 unit tests that "test" the behavior of various parsers on
-  invalid markup have been removed. Legitimate changes to those
-  parsers caused these tests to fail, indicating that perhaps
-  Beautiful Soup should not test the behavior of foreign
-  libraries.
-
-  The problematic unit tests have been reformulated as informational
-  comparisons generated by the script
-  scripts/demonstrate_parser_differences.py.
-
-  This makes Beautiful Soup compatible with html5lib version 0.95 and
-  future versions of HTMLParser.
-
-= 4.0.0b6 (20120216) =
-
-* Multi-valued attributes like "class" always have a list of values,
-  even if there's only one value in the list.
-
-* Added a number of multi-valued attributes defined in HTML5.
-
-* Stopped generating a space before the slash that closes an
-  empty-element tag. This may come back if I add a special XHTML mode
-  (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty
-  useless.
-
-* Passing text along with tag-specific arguments to a find* method:
-
-  find("a", text="Click here")
-
-  will find tags that contain the given text as their
-  .string. Previously, the tag-specific arguments were ignored and
-  only strings were searched.
-
-* Fixed a bug that caused the html5lib tree builder to build a
-  partially disconnected tree. Generally cleaned up the html5lib tree
-  builder.
-
-* If you restrict a multi-valued attribute like "class" to a string
-  that contains spaces, Beautiful Soup will only consider it a match
-  if the values correspond to that specific string.
-
-= 4.0.0b5 (20120209) =
-
-* Rationalized Beautiful Soup's treatment of CSS class. A tag
-  belonging to multiple CSS classes is treated as having a list of
-  values for the 'class' attribute. Searching for a CSS class will
-  match *any* of the CSS classes.
-
-  This actually affects all attributes that the HTML standard defines
-  as taking multiple values (class, rel, rev, archive, accept-charset,
-  and headers), but 'class' is by far the most common. [bug=41034]
-
-* If you pass anything other than a dictionary as the second argument
-  to one of the find* methods, it'll assume you want to use that
-  object to search against a tag's CSS classes. Previously this only
-  worked if you passed in a string.
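
The multi-valued 'class' handling from the 4.0.0b6/b5 entries above, sketched with invented markup:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p class="body strikeout">text</p>', "html.parser")

    # 'class' is multi-valued: its value is a list, even for one class.
    print(soup.p["class"])            # ['body', 'strikeout']

    # A non-dict second argument to a find* method is matched against
    # CSS classes, so this finds the tag by one of its classes.
    print(soup.find_all("p", "strikeout"))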
-
-* Fixed a bug that caused a crash when you passed a dictionary as an
-  attribute value (possibly because you mistyped "attrs"). [bug=842419]
-
-* Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags
-  like <meta charset="utf-8" />. [bug=837268]
-
-* If Unicode, Dammit can't figure out a consistent encoding for a
-  page, it will try each of its guesses again, with errors="replace"
-  instead of errors="strict". This may mean that some data gets
-  replaced with REPLACEMENT CHARACTER, but at least most of it will
-  get turned into Unicode. [bug=754903]
-
-* Patched over a bug in html5lib (?) that was crashing Beautiful Soup
-  on certain kinds of markup. [bug=838800]
-
-* Fixed a bug that wrecked the tree if you replaced an element with an
-  empty string. [bug=728697]
-
-* Improved Unicode, Dammit's behavior when you give it Unicode to
-  begin with.
-
-= 4.0.0b4 (20120208) =
-
-* Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag()
-
-* BeautifulSoup.new_tag() will follow the rules of whatever
-  tree-builder was used to create the original BeautifulSoup object. A
-  new <p> tag will look like "<p />" if the soup object was created to
-  parse XML, but it will look like "<p></p>" if the soup object was
-  created to parse HTML.
-
-* We pass in strict=False to html.parser on Python 3, greatly
-  improving html.parser's ability to handle bad HTML.
-
-* We also monkeypatch a serious bug in html.parser that made
-  strict=False disastrous on Python 3.2.2.
-
-* Replaced the "substitute_html_entities" argument with the
-  more general "formatter" argument.
-
-* Bare ampersands and angle brackets are always converted to XML
-  entities unless the user prevents it.
-
-* Added PageElement.insert_before() and PageElement.insert_after(),
-  which let you put an element into the parse tree with respect to
-  some other element.
-
-* Raise an exception when the user tries to do something nonsensical
-  like insert a tag into itself.
-
-
-= 4.0.0b3 (20120203) =
-
-Beautiful Soup 4 is a nearly-complete rewrite that removes Beautiful
-Soup's custom HTML parser in favor of a system that lets you write a
-little glue code and plug in any HTML or XML parser you want.
-
-Beautiful Soup 4.0 comes with glue code for four parsers:
-
- * Python's standard HTMLParser (html.parser in Python 3)
- * lxml's HTML and XML parsers
- * html5lib's HTML parser
-
-HTMLParser is the default, but I recommend you install lxml if you
-can.
-
-For complete documentation, see the Sphinx documentation in
-bs4/doc/source/. What follows is a summary of the changes from
-Beautiful Soup 3.
-
-=== The module name has changed ===
-
-Previously you imported the BeautifulSoup class from a module also
-called BeautifulSoup. To save keystrokes and make it clear which
-version of the API is in use, the module is now called 'bs4':
-
-  >>> from bs4 import BeautifulSoup
-
-=== It works with Python 3 ===
-
-Beautiful Soup 3.1.0 worked with Python 3, but the parser it used was
-so bad that it barely worked at all. Beautiful Soup 4 works with
-Python 3, and since its parser is pluggable, you don't sacrifice
-quality.
-
-Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3
-support to the finish line. Ezio Melotti is also to thank for greatly
-improving the HTML parser that comes with Python 3.2.
-
-=== CDATA sections are normal text, if they're understood at all. ===
-
-Currently, the lxml and html5lib HTML parsers ignore CDATA sections in
-markup:
-
-  <p><![CDATA[foo]]></p> => <p></p>
-
-A future version of html5lib will turn CDATA sections into text nodes,
-but only within tags like <svg> and <math>:
-
-  <svg><![CDATA[foo]]></svg> => <p>foo</p>
-
-The default XML parser (which uses lxml behind the scenes) turns CDATA
-sections into ordinary text elements:
-
-  <p><![CDATA[foo]]></p> => <p>foo</p>
-
-In theory it's possible to preserve the CDATA sections when using the
-XML parser, but I don't see how to get it to work in practice.
-
-=== Miscellaneous other stuff ===
-
-If the BeautifulSoup instance has .is_xml set to True, an appropriate
-XML declaration will be emitted when the tree is transformed into a
-string:
-
-  <?xml version="1.0" encoding="utf-8">
-  <markup>
-   ...
-  </markup>
-
-The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree
-builders set it to False. If you want to parse XHTML with an HTML
-parser, you can set it manually.
-
-
-= 3.2.0 =
-
-The 3.1 series wasn't very useful, so I renamed the 3.0 series to 3.2
-to make it obvious which one you should use.
-
-= 3.1.0 =
-
-A hybrid version that supports 2.4 and can be automatically converted
-to run under Python 3.0. There are three backwards-incompatible
-changes you should be aware of, but no new features or deliberate
-behavior changes.
-
-1. str() may no longer do what you want. This is because the meaning
-of str() inverts between Python 2 and 3; in Python 2 it gives you a
-byte string, in Python 3 it gives you a Unicode string.
-
-The effect of this is that you can't pass an encoding to .__str__
-anymore. Use encode() to get a string and decode() to get Unicode, and
-you'll be ready (well, readier) for Python 3.
-
-2. Beautiful Soup is now based on HTMLParser rather than SGMLParser,
-which is gone in Python 3. There's some bad HTML that SGMLParser
-handled but HTMLParser doesn't, usually to do with attribute values
-that aren't closed or have brackets inside them:
-
-  <a href="foo</a>, </a><a href="bar">baz</a>
-  <a b="<a>">', '<a b="<a>"></a><a>"></a>
-
-A later version of Beautiful Soup will allow you to plug in different
-parsers to make tradeoffs between speed and the ability to handle bad
-HTML.
-
-3. In Python 3 (but not Python 2), HTMLParser converts entities within
-attributes to the corresponding Unicode characters. In Python 2 it's
-possible to parse this string and leave the é intact.
-
-  <a href="http://crummy.com?sacré&bleu">
-
-In Python 3, the é is always converted to \xe9 during
-parsing.
-
-
-= 3.0.7a =
-
-Added an import that makes BS work in Python 2.3.
-
-
-= 3.0.7 =
-
-Fixed a UnicodeDecodeError when unpickling documents that contain
-non-ASCII characters.
-
-Fixed a TypeError that occured in some circumstances when a tag
-contained no text.
-
-Jump through hoops to avoid the use of chardet, which can be extremely
-slow in some circumstances. UTF-8 documents should never trigger the
-use of chardet.
-
-Whitespace is preserved inside <pre> and <textarea> tags that contain
-nothing but whitespace.
-
-Beautiful Soup can now parse a doctype that's scoped to an XML namespace.
-
-
-= 3.0.6 =
-
-Got rid of a very old debug line that prevented chardet from working.
-
-Added a Tag.decompose() method that completely disconnects a tree or a
-subset of a tree, breaking it up into bite-sized pieces that are
-easy for the garbage collecter to collect.
-
-Tag.extract() now returns the tag that was extracted.
-
-Tag.findNext() now does something with the keyword arguments you pass
-it instead of dropping them on the floor.
-
-Fixed a Unicode conversion bug.
-
-Fixed a bug that garbled some <meta> tags when rewriting them.
-
-
-= 3.0.5 =
-
-Soup objects can now be pickled, and copied with copy.deepcopy.
-
-Tag.append now works properly on existing BS objects. (It wasn't
-originally intended for outside use, but it can be now.) (Giles
-Radford)
-
-Passing in a nonexistent encoding will no longer crash the parser on
-Python 2.4 (John Nagle).
-
-Fixed an underlying bug in SGMLParser that thinks ASCII has 255
-characters instead of 127 (John Nagle).
-
-Entities are converted more consistently to Unicode characters.
-
-Entity references in attribute values are now converted to Unicode
-characters when appropriate. Numeric entities are always converted,
-because SGMLParser always converts them outside of attribute values.
-
-ALL_ENTITIES happens to just be the XHTML entities, so I renamed it to
-XHTML_ENTITIES.
-
-The regular expression for bare ampersands was too loose. In some
-cases ampersands were not being escaped. (Sam Ruby?)
-
-Non-breaking spaces and other special Unicode space characters are no
-longer folded to ASCII spaces. (Robert Leftwich)
-
-Information inside a TEXTAREA tag is now parsed literally, not as HTML
-tags. TEXTAREA now works exactly the same way as SCRIPT. (Zephyr Fang)
-
-= 3.0.4 =
-
-Fixed a bug that crashed Unicode conversion in some cases.
-
-Fixed a bug that prevented UnicodeDammit from being used as a
-general-purpose data scrubber.
-
-Fixed some unit test failures when running against Python 2.5.
-
-When considering whether to convert smart quotes, UnicodeDammit now
-looks at the original encoding in a case-insensitive way.
-
-= 3.0.3 (20060606) =
-
-Beautiful Soup is now usable as a way to clean up invalid XML/HTML (be
-sure to pass in an appropriate value for convertEntities, or XML/HTML
-entities might stick around that aren't valid in HTML/XML). The result
-may not validate, but it should be good enough to not choke a
-real-world XML parser. Specifically, the output of a properly
-constructed soup object should always be valid as part of an XML
-document, but parts may be missing if they were missing in the
-original. As always, if the input is valid XML, the output will also
-be valid.
-
-= 3.0.2 (20060602) =
-
-Previously, Beautiful Soup correctly handled attribute values that
-contained embedded quotes (sometimes by escaping), but not other kinds
-of XML character. Now, it correctly handles or escapes all special XML
-characters in attribute values.
-
-I aliased methods to the 2.x names (fetch, find, findText, etc.) for
-backwards compatibility purposes. Those names are deprecated and if I
-ever do a 4.0 I will remove them. I will, I tell you!
-
-Fixed a bug where the findAll method wasn't passing along any keyword
-arguments.
-
-When run from the command line, Beautiful Soup now acts as an HTML
-pretty-printer, not an XML pretty-printer.
-
-= 3.0.1 (20060530) =
-
-Reintroduced the "fetch by CSS class" shortcut. I thought keyword
-arguments would replace it, but they don't. You can't call soup('a',
-class='foo') because class is a Python keyword.
-
-If Beautiful Soup encounters a meta tag that declares the encoding,
-but a SoupStrainer tells it not to parse that tag, Beautiful Soup will
-no longer try to rewrite the meta tag to mention the new
-encoding. Basically, this makes SoupStrainers work in real-world
-applications instead of crashing the parser.
-
-= 3.0.0 "Who would not give all else for two p" (20060528) =
-
-This release is not backward-compatible with previous releases. If
-you've got code written with a previous version of the library, go
-ahead and keep using it, unless one of the features mentioned here
-really makes your life easier. Since the library is self-contained,
-you can include an old copy of the library in your old applications,
-and use the new version for everything else.
-
-The documentation has been rewritten and greatly expanded with many
-more examples.
-
-Beautiful Soup autodetects the encoding of a document (or uses the one
-you specify), and converts it from its native encoding to
-Unicode. Internally, it only deals with Unicode strings. When you
-print out the document, it converts to UTF-8 (or another encoding you
-specify). [Doc reference]
-
-It's now easy to make large-scale changes to the parse tree without
-screwing up the navigation members. The methods are extract,
-replaceWith, and insert. [Doc reference. See also Improving Memory
-Usage with extract]
-
-Passing True in as an attribute value gives you tags that have any
-value for that attribute. You don't have to create a regular
-expression. Passing None for an attribute value gives you tags that
-don't have that attribute at all.
-
-Tag objects now know whether or not they're self-closing. This avoids
-the problem where Beautiful Soup thought that tags like <BR /> were
-self-closing even in XML documents. You can customize the self-closing
-tags for a parser object by passing them in as a list of
-selfClosingTags: you don't have to subclass anymore.
-
-There's a new built-in parser, MinimalSoup, which has most of
-BeautifulSoup's HTML-specific rules, but no tag nesting rules. [Doc
-reference]
-
-You can use a SoupStrainer to tell Beautiful Soup to parse only part
-of a document. This saves time and memory, often making Beautiful Soup
-about as fast as a custom-built SGMLParser subclass. [Doc reference,
-SoupStrainer reference]
-
-You can (usually) use keyword arguments instead of passing a
-dictionary of attributes to a search method. That is, you can replace
-soup(args={"id" : "5"}) with soup(id="5"). You can still use args if
-(for instance) you need to find an attribute whose name clashes with
-the name of an argument to findAll. [Doc reference: **kwargs attrs]
-
-The method names have changed to the better method names used in
-Rubyful Soup. Instead of find methods and fetch methods, there are
-only find methods. Instead of a scheme where you can't remember which
-method finds one element and which one finds them all, we have find
-and findAll. In general, if the method name mentions All or a plural
-noun (eg. findNextSiblings), then it finds many elements
-method. Otherwise, it only finds one element. [Doc reference]
-
-Some of the argument names have been renamed for clarity. For instance
-avoidParserProblems is now parserMassage.
-
-Beautiful Soup no longer implements a feed method. You need to pass a
-string or a filehandle into the soup constructor, not with feed after
-the soup has been created. There is still a feed method, but it's the
-feed method implemented by SGMLParser and calling it will bypass
-Beautiful Soup and cause problems.
-
-The NavigableText class has been renamed to NavigableString. There is
-no NavigableUnicodeString anymore, because every string inside a
-Beautiful Soup parse tree is a Unicode string.
-
-findText and fetchText are gone. Just pass a text argument into find
-or findAll.
-
-Null was more trouble than it was worth, so I got rid of it. Anything
-that used to return Null now returns None.
-
-Special XML constructs like comments and CDATA now have their own
-NavigableString subclasses, instead of being treated as oddly-formed
-data. If you parse a document that contains CDATA and write it back
-out, the CDATA will still be there.
-
-When you're parsing a document, you can get Beautiful Soup to convert
-XML or HTML entities into the corresponding Unicode characters. [Doc
-reference]
-
-= 2.1.1 (20050918) =
-
-Fixed a serious performance bug in BeautifulStoneSoup which was
-causing parsing to be incredibly slow.
-
-Corrected several entities that were previously being incorrectly
-translated from Microsoft smart-quote-like characters.
-
-Fixed a bug that was breaking text fetch.
-
-Fixed a bug that crashed the parser when text chunks that look like
-HTML tag names showed up within a SCRIPT tag.
-
-THEAD, TBODY, and TFOOT tags are now nestable within TABLE
-tags. Nested tables should parse more sensibly now.
-
-BASE is now considered a self-closing tag.
-
-= 2.1.0 "Game, or any other dish?" (20050504) =
-
-Added a wide variety of new search methods which, given a starting
-point inside the tree, follow a particular navigation member (like
-nextSibling) over and over again, looking for Tag and NavigableText
-objects that match certain criteria. The new methods are findNext,
-fetchNext, findPrevious, fetchPrevious, findNextSibling,
-fetchNextSiblings, findPreviousSibling, fetchPreviousSiblings,
-findParent, and fetchParents. All of these use the same basic code
-used by first and fetch, so you can pass your weird ways of matching
-things into these methods.
-
-The fetch method and its derivatives now accept a limit argument.
-
-You can now pass keyword arguments when calling a Tag object as though
-it were a method.
-
-Fixed a bug that caused all hand-created tags to share a single set of
-attributes.
-
-= 2.0.3 (20050501) =
-
-Fixed Python 2.2 support for iterators.
-
-Fixed a bug that gave the wrong representation to tags within quote
-tags like <script>.
-
-Took some code from Mark Pilgrim that treats CDATA declarations as
-data instead of ignoring them.
-
-Beautiful Soup's setup.py will now do an install even if the unit
-tests fail. It won't build a source distribution if the unit tests
-fail, so I can't release a new version unless they pass.
-
-= 2.0.2 (20050416) =
-
-Added the unit tests in a separate module, and packaged it with
-distutils.
-
-Fixed a bug that sometimes caused renderContents() to return a Unicode
-string even if there was no Unicode in the original string.
-
-Added the done() method, which closes all of the parser's open
-tags. It gets called automatically when you pass in some text to the
-constructor of a parser class; otherwise you must call it yourself.
-
-Reinstated some backwards compatibility with 1.x versions: referencing
-the string member of a NavigableText object returns the NavigableText
-object instead of throwing an error.
-
-= 2.0.1 (20050412) =
-
-Fixed a bug that caused bad results when you tried to reference a tag
-name shorter than 3 characters as a member of a Tag, eg. tag.table.td.
-
-Made sure all Tags have the 'hidden' attribute so that an attempt to
-access tag.hidden doesn't spawn an attempt to find a tag named
-'hidden'.
-
-Fixed a bug in the comparison operator.
-
-= 2.0.0 "Who cares for fish?" (20050410)
-
-Beautiful Soup version 1 was very useful but also pretty stupid. I
-originally wrote it without noticing any of the problems inherent in
-trying to build a parse tree out of ambiguous HTML tags. This version
-solves all of those problems to my satisfaction. It also adds many new
-clever things to make up for the removal of the stupid things.
-
-== Parsing ==
-
-The parser logic has been greatly improved, and the BeautifulSoup
-class should much more reliably yield a parse tree that looks like
-what the page author intended. For a particular class of odd edge
-cases that now causes problems, there is a new class,
-ICantBelieveItsBeautifulSoup.
-
-By default, Beautiful Soup now performs some cleanup operations on
-text before parsing it. This is to avoid common problems with bad
-definitions and self-closing tags that crash SGMLParser. You can
-provide your own set of cleanup operations, or turn it off
-altogether. The cleanup operations include fixing self-closing tags
-that don't close, and replacing Microsoft smart quotes and similar
-characters with their HTML entity equivalents.
-
-You can now get a pretty-print version of parsed HTML to get a visual
-picture of how Beautiful Soup parses it, with the Tag.prettify()
-method.
-
-== Strings and Unicode ==
-
-There are separate NavigableText subclasses for ASCII and Unicode
-strings. These classes directly subclass the corresponding base data
-types. This means you can treat NavigableText objects as strings
-instead of having to call methods on them to get the strings.
-
-str() on a Tag always returns a string, and unicode() always returns
-Unicode. Previously it was inconsistent.
-
-== Tree traversal ==
-
-In a first() or fetch() call, the tag name or the desired value of an
-attribute can now be any of the following:
-
- * A string (matches that specific tag or that specific attribute value)
- * A list of strings (matches any tag or attribute value in the list)
- * A compiled regular expression object (matches any tag or attribute
-   value that matches the regular expression)
- * A callable object that takes the Tag object or attribute value as a
-   string. It returns None/false/empty string if the given string
-   doesn't match, and any other value if it does.
-
-This is much easier to use than SQL-style wildcards (see, regular
-expressions are good for something). Because of this, I took out
-SQL-style wildcards. I'll put them back if someone complains, but
-their removal simplifies the code a lot.
-
-You can use fetch() and first() to search for text in the parse tree,
-not just tags. There are new alias methods fetchText() and firstText()
-designed for this purpose. As with searching for tags, you can pass in
-a string, a regular expression object, or a method to match your text.
-
-If you pass in something besides a map to the attrs argument of
-fetch() or first(), Beautiful Soup will assume you want to match that
-thing against the "class" attribute. When you're scraping
-well-structured HTML, this makes your code a lot cleaner.
-
-1.x and 2.x both let you call a Tag object as a shorthand for
-fetch(). For instance, foo("bar") is a shorthand for
-foo.fetch("bar"). In 2.x, you can also access a specially-named member
-of a Tag object as a shorthand for first(). For instance, foo.barTag
-is a shorthand for foo.first("bar"). By chaining these shortcuts you
-traverse a tree in very little code: for header in
-soup.bodyTag.pTag.tableTag('th'):
-
-If an element relationship (like parent or next) doesn't apply to a
-tag, it'll now show up Null instead of None. first() will also return
-Null if you ask it for a nonexistent tag. Null is an object that's
-just like None, except you can do whatever you want to it and it'll
-give you Null instead of throwing an error.
-
-This lets you do tree traversals like soup.htmlTag.headTag.titleTag
-without having to worry if the intermediate stages are actually
-there. Previously, if there was no 'head' tag in the document, headTag
-in that instance would have been None, and accessing its 'titleTag'
-member would have thrown an AttributeError. Now, you can get what you
-want when it exists, and get Null when it doesn't, without having to
-do a lot of conditionals checking to see if every stage is None.
-
-There are two new relations between page elements: previousSibling and
-nextSibling. They reference the previous and next element at the same
-level of the parse tree. For instance, if you have HTML like this:
-
-  <p><ul><li>Foo<br /><li>Bar</ul>
-
-The first 'li' tag has a previousSibling of Null and its nextSibling
-is the second 'li' tag. The second 'li' tag has a nextSibling of Null
-and its previousSibling is the first 'li' tag. The previousSibling of
-the 'ul' tag is the first 'p' tag. The nextSibling of 'Foo' is the
-'br' tag.
-
-I took out the ability to use fetch() to find tags that have a
-specific list of contents. See, I can't even explain it well. It was
-really difficult to use, I never used it, and I don't think anyone
-else ever used it. To the extent anyone did, they can probably use
-fetchText() instead. If it turns out someone needs it I'll think of
-another solution.
-
-== Tree manipulation ==
-
-You can add new attributes to a tag, and delete attributes from a
-tag. In 1.x you could only change a tag's existing attributes.
-
-== Porting Considerations ==
-
-There are three changes in 2.0 that break old code:
-
-In the post-1.2 release you could pass in a function into fetch(). The
-function took a string, the tag name. In 2.0, the function takes the
-actual Tag object.
-
-It's no longer to pass in SQL-style wildcards to fetch(). Use a
-regular expression instead.
-
-The different parsing algorithm means the parse tree may not be shaped
-like you expect. This will only actually affect you if your code uses
-one of the affected parts. I haven't run into this problem yet while
-porting my code.
-
-= Between 1.2 and 2.0 =
-
-This is the release to get if you want Python 1.5 compatibility.
-
-The desired value of an attribute can now be any of the following:
-
- * A string
- * A string with SQL-style wildcards
- * A compiled RE object
- * A callable that returns None/false/empty string if the given value
-   doesn't match, and any other value otherwise.
-
-This is much easier to use than SQL-style wildcards (see, regular
-expressions are good for something). Because of this, I no longer
-recommend you use SQL-style wildcards. They may go away in a future
-release to clean up the code.
-
-Made Beautiful Soup handle processing instructions as text instead of
-ignoring them.
-
-Applied patch from Richie Hindle (richie at entrian dot com) that
-makes tag.string a shorthand for tag.contents[0].string when the tag
-has only one string-owning child.
-
-Added still more nestable tags. The nestable tags thing won't work in
-a lot of cases and needs to be rethought.
-
-Fixed an edge case where searching for "%foo" would match any string
-shorter than "foo".
-
-= 1.2 "Who for such dainties would not stoop?" (20040708) =
-
-Applied patch from Ben Last (ben at benlast dot com) that made
-Tag.renderContents() correctly handle Unicode.
-
-Made BeautifulStoneSoup even dumber by making it not implicitly close
-a tag when another tag of the same type is encountered; only when an
-actual closing tag is encountered. This change courtesy of Fuzzy (mike
-at pcblokes dot com). BeautifulSoup still works as before.
-
-= 1.1 "Swimming in a hot tureen" =
-
-Added more 'nestable' tags. Changed popping semantics so that when a
-nestable tag is encountered, tags are popped up to the previously
-encountered nestable tag (of whatever kind). I will revert this if
-enough people complain, but it should make more people's lives easier
-than harder. This enhancement was suggested by Anthony Baxter (anthony
-at interlink dot com dot au).
-
-= 1.0 "So rich and green" (20040420) =
-
-Initial release.
diff --git a/libs/bs4/README.txt b/libs/bs4/README.txt
deleted file mode 100644
index 305c51e05..000000000
--- a/libs/bs4/README.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-= Introduction =
-
- >>> from bs4 import BeautifulSoup
- >>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
- >>> print soup.prettify()
- <html>
-  <body>
-   <p>
-    Some
-    <b>
-     bad
-     <i>
-      HTML
-     </i>
-    </b>
-   </p>
-  </body>
- </html>
- >>> soup.find(text="bad")
- u'bad'
-
- >>> soup.i
- <i>HTML</i>
-
- >>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
- >>> print soup.prettify()
- <?xml version="1.0" encoding="utf-8">
- <tag1>
-  Some
-  <tag2 />
-  bad
-  <tag3>
-   XML
-  </tag3>
- </tag1>
-
-= Full documentation =
-
-The bs4/doc/ directory contains full documentation in Sphinx
-format. Run "make html" in that directory to create HTML
-documentation.
-
-= Running the unit tests =
-
-Beautiful Soup supports unit test discovery from the project root directory:
-
- $ nosetests
-
- $ python -m unittest discover -s bs4 # Python 2.7 and up
-
-If you checked out the source tree, you should see a script in the
-home directory called test-all-versions. This script will run the unit
-tests under Python 2.7, then create a temporary Python 3 conversion of
-the source and run the unit tests again under Python 3.
-
-= Links =
-
-Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
-Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
-  http://readthedocs.org/docs/beautiful-soup-4/
-Discussion group: http://groups.google.com/group/beautifulsoup/
-Development: https://code.launchpad.net/beautifulsoup/
-Bug tracker: https://bugs.launchpad.net/beautifulsoup/
diff --git a/libs/bs4/TODO.txt b/libs/bs4/TODO.txt
deleted file mode 100644
index e26d6264d..000000000
--- a/libs/bs4/TODO.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-Additions
----------
-
-More of the jQuery API: nextUntil?
-
-Optimizations
--------------
-
-The html5lib tree builder doesn't use the standard tree-building API,
-which worries me and has resulted in a number of bugs.
-
-markup_attr_map can be optimized since it's always a map now.
-
-Upon encountering UTF-16LE data or some other uncommon serialization
-of Unicode, UnicodeDammit will convert the data to Unicode, then
-encode it at UTF-8. This is wasteful because it will just get decoded
-back to Unicode.
-
-CDATA
------
-
-The elementtree XMLParser has a strip_cdata argument that, when set to
-False, should allow Beautiful Soup to preserve CDATA sections instead
-of treating them as text. Except it doesn't. (This argument is also
-present for HTMLParser, and also does nothing there.)
-
-Currently, htm5lib converts CDATA sections into comments. An
-as-yet-unreleased version of html5lib changes the parser's handling of
-CDATA sections to allow CDATA sections in tags like <svg> and
-<math>. The HTML5TreeBuilder will need to be updated to create CData
-objects instead of Comment objects in this situation.
diff --git a/libs/bs4/__init__.py b/libs/bs4/__init__.py
index 7a80452f7..95ca229c1 100644
--- a/libs/bs4/__init__.py
+++ b/libs/bs4/__init__.py
@@ -17,18 +17,17 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """
 
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
 __author__ = "Leonard Richardson ([email protected])"
-__version__ = "4.6.0"
-__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
+__version__ = "4.8.0"
+__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+# Use of this source code is governed by the MIT license.
 __license__ = "MIT"
 
 __all__ = ['BeautifulSoup']
 
 import os
 import re
+import sys
 import traceback
 import warnings
 
@@ -50,7 +49,7 @@ from .element import (
 
 # The very first thing we do is give a useful error if someone is
 # running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
 
 class BeautifulSoup(Tag):
     """
@@ -74,7 +73,7 @@ class BeautifulSoup(Tag):
     like HTML's <br> tag), call handle_starttag and then handle_endtag.
     """
 
-    ROOT_TAG_NAME = u'[document]'
+    ROOT_TAG_NAME = '[document]'
 
     # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
@@ -82,16 +81,56 @@ class BeautifulSoup(Tag):
 
     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
 
-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
 
     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
                  **kwargs):
-        """The Soup object is initialized as the 'root tag', and the
-        provided markup (which can be a string or a file-like object)
-        is fed into the underlying parser."""
+        """Constructor.
+ + :param markup: A string or a file-like object representing + markup to be parsed. + + :param features: Desirable features of the parser to be used. This + may be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type of markup + to be used ("html", "html5", "xml"). It's recommended that you + name a specific parser, so that Beautiful Soup gives you the + same results across platforms and virtual environments. + + :param builder: A TreeBuilder subclass to instantiate (or + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. + + :param parse_only: A SoupStrainer. Only parts of the document + matching the SoupStrainer will be considered. This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. + + :param from_encoding: A string indicating the encoding of the + document to be parsed. Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. + + :param exclude_encodings: A list of strings indicating + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. + + :param kwargs: For backwards compatibility purposes, the + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4; they will result in a warning and then be ignored. + + Apart from this, any keyword arguments passed into the BeautifulSoup + constructor are propagated to the TreeBuilder constructor. This + makes it possible to configure a TreeBuilder beyond saying + which one to use. + + """ if 'convertEntities' in kwargs: + del kwargs['convertEntities'] warnings.warn( "BS4 does not respect the convertEntities argument to the " "BeautifulSoup constructor. Entities are always converted " @@ -142,18 +181,22 @@ class BeautifulSoup(Tag): from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") - if from_encoding and isinstance(markup, unicode): + if from_encoding and isinstance(markup, str): warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") from_encoding = None - if len(kwargs) > 0: - arg = kwargs.keys().pop() - raise TypeError( - "__init__() got an unexpected keyword argument '%s'" % arg) - - if builder is None: - original_features = features - if isinstance(features, basestring): + # We need this information to track whether or not the builder + # was specified well enough that we can omit the 'you need to + # specify a parser' warning. + original_builder = builder + original_features = features + + if isinstance(builder, type): + # A builder class was passed in; it needs to be instantiated. + builder_class = builder + builder = None + elif builder is None: + if isinstance(features, str): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES @@ -163,41 +206,73 @@ class BeautifulSoup(Tag): "Couldn't find a tree builder with the features you " "requested: %s. Do you need to install a parser library?" % ",".join(features)) - builder = builder_class() - if not (original_features == builder.NAME or - original_features in builder.ALTERNATE_NAMES): + + # At this point either we have a TreeBuilder instance in + # builder, or we have a builder_class that we can instantiate + # with the remaining **kwargs. 
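A short usage sketch of the constructor parameters documented above (a named parser via features, parsing narrowed with a SoupStrainer via parse_only):

    >>> from bs4 import BeautifulSoup, SoupStrainer
    >>> markup = '<p><a href="/x">x</a><a href="/y">y</a></p>'
    >>> soup = BeautifulSoup(markup, features="html.parser",
    ...                      parse_only=SoupStrainer("a"))
    >>> soup.find_all("a")
    [<a href="/x">x</a>, <a href="/y">y</a>]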
+ if builder is None: + builder = builder_class(**kwargs) + if not original_builder and not ( + original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES + ): if builder.is_xml: markup_type = "XML" else: markup_type = "HTML" - caller = traceback.extract_stack()[0] - filename = caller[0] - line_number = caller[1] - warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( - filename=filename, - line_number=line_number, - parser=builder.NAME, - markup_type=markup_type)) - + # This code adapted from warnings.py so that we get the same line + # of code as our warnings.warn() call gets, even if the answer is wrong + # (as it may be in a multithreading situation). + caller = None + try: + caller = sys._getframe(1) + except ValueError: + pass + if caller: + globals = caller.f_globals + line_number = caller.f_lineno + else: + globals = sys.__dict__ + line_number= 1 + filename = globals.get('__file__') + if filename: + fnl = filename.lower() + if fnl.endswith((".pyc", ".pyo")): + filename = filename[:-1] + if filename: + # If there is no filename at all, the user is most likely in a REPL, + # and the warning is not necessary. + values = dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type + ) + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) + else: + if kwargs: + warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") + self.builder = builder self.is_xml = builder.is_xml self.known_xml = self.is_xml - self.builder.soup = self - + self._namespaces = dict() self.parse_only = parse_only + self.builder.initialize_soup(self) + if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() elif len(markup) <= 256 and ( (isinstance(markup, bytes) and not b'<' in markup) - or (isinstance(markup, unicode) and not u'<' in markup) + or (isinstance(markup, str) and not '<' in markup) ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, # just in case that's what the user really wants. - if (isinstance(markup, unicode) + if (isinstance(markup, str) and not os.path.supports_unicode_filenames): possible_filename = markup.encode("utf8") else: @@ -205,13 +280,13 @@ class BeautifulSoup(Tag): is_file = False try: is_file = os.path.exists(possible_filename) - except Exception, e: + except Exception as e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. pass if is_file: - if isinstance(markup, unicode): + if isinstance(markup, str): markup = markup.encode("utf8") warnings.warn( '"%s" looks like a filename, not markup. 
You should' @@ -263,9 +338,9 @@ class BeautifulSoup(Tag): if isinstance(markup, bytes): space = b' ' cant_start_with = (b"http:", b"https:") - elif isinstance(markup, unicode): - space = u' ' - cant_start_with = (u"http:", u"https:") + elif isinstance(markup, str): + space = ' ' + cant_start_with = ("http:", "https:") else: return @@ -302,9 +377,10 @@ class BeautifulSoup(Tag): self.preserve_whitespace_tag_stack = [] self.pushTag(self) - def new_tag(self, name, namespace=None, nsprefix=None, **attrs): + def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs): """Create a new tag associated with this soup.""" - return Tag(None, self.builder, name, namespace, nsprefix, attrs) + kwattrs.update(attrs) + return Tag(None, self.builder, name, namespace, nsprefix, kwattrs) def new_string(self, s, subclass=NavigableString): """Create a new NavigableString associated with this soup.""" @@ -327,7 +403,7 @@ class BeautifulSoup(Tag): def pushTag(self, tag): #print "Push", tag.name - if self.currentTag: + if self.currentTag is not None: self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] @@ -336,7 +412,7 @@ class BeautifulSoup(Tag): def endData(self, containerClass=NavigableString): if self.current_data: - current_data = u''.join(self.current_data) + current_data = ''.join(self.current_data) # If whitespace is not preserved, and this string contains # nothing but ASCII spaces, replace it with a single space # or newline. @@ -366,60 +442,71 @@ class BeautifulSoup(Tag): def object_was_parsed(self, o, parent=None, most_recent_element=None): """Add an object to the parse tree.""" - parent = parent or self.currentTag - previous_element = most_recent_element or self._most_recent_element + if parent is None: + parent = self.currentTag + if most_recent_element is not None: + previous_element = most_recent_element + else: + previous_element = self._most_recent_element next_element = previous_sibling = next_sibling = None if isinstance(o, Tag): next_element = o.next_element next_sibling = o.next_sibling previous_sibling = o.previous_sibling - if not previous_element: + if previous_element is None: previous_element = o.previous_element + fix = parent.next_element is not None + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) self._most_recent_element = o parent.contents.append(o) - if parent.next_sibling: - # This node is being inserted into an element that has - # already been parsed. Deal with any dangling references. - index = len(parent.contents)-1 - while index >= 0: - if parent.contents[index] is o: - break - index -= 1 - else: - raise ValueError( - "Error building tree: supposedly %r was inserted " - "into %r after the fact, but I don't see it!" % ( - o, parent - ) - ) - if index == 0: - previous_element = parent - previous_sibling = None - else: - previous_element = previous_sibling = parent.contents[index-1] - if index == len(parent.contents)-1: - next_element = parent.next_sibling - next_sibling = None - else: - next_element = next_sibling = parent.contents[index+1] - - o.previous_element = previous_element - if previous_element: - previous_element.next_element = o - o.next_element = next_element - if next_element: - next_element.previous_element = o - o.next_sibling = next_sibling - if next_sibling: - next_sibling.previous_sibling = o - o.previous_sibling = previous_sibling - if previous_sibling: - previous_sibling.next_sibling = o + # Check if we are inserting into an already parsed node. 
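The reworked new_tag() above accepts an attrs dict alongside keyword arguments, which helps with attribute names that aren't valid Python identifiers; a small sketch (attributes are sorted alphabetically on output):

    >>> soup = BeautifulSoup("", "html.parser")
    >>> soup.new_tag("a", href="/x", attrs={"data-id": "7"})
    <a data-id="7" href="/x"></a>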
+ if fix: + self._linkage_fixer(parent) + + def _linkage_fixer(self, el): + """Make sure linkage of this fragment is sound.""" + + first = el.contents[0] + child = el.contents[-1] + descendant = child + + if child is first and el.parent is not None: + # Parent should be linked to first child + el.next_element = child + # We are no longer linked to whatever this element is + prev_el = child.previous_element + if prev_el is not None and prev_el is not el: + prev_el.next_element = None + # First child should be linked to the parent, and no previous siblings. + child.previous_element = el + child.previous_sibling = None + + # We have no sibling as we've been appended as the last. + child.next_sibling = None + + # This index is a tag, dig deeper for a "last descendant" + if isinstance(child, Tag) and child.contents: + descendant = child._last_descendant(False) + + # As the final step, link last descendant. It should be linked + # to the parent's next sibling (if found), else walk up the chain + # and find a parent with a sibling. It should have no next sibling. + descendant.next_element = None + descendant.next_sibling = None + target = el + while True: + if target is None: + break + elif target.next_sibling is not None: + descendant.next_element = target.next_sibling + target.next_sibling.previous_element = child + break + target = target.parent def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent @@ -465,7 +552,7 @@ class BeautifulSoup(Tag): self.currentTag, self._most_recent_element) if tag is None: return tag - if self._most_recent_element: + if self._most_recent_element is not None: self._most_recent_element.next_element = tag self._most_recent_element = tag self.pushTag(tag) @@ -490,9 +577,9 @@ class BeautifulSoup(Tag): encoding_part = '' if eventual_encoding != None: encoding_part = ' encoding="%s"' % eventual_encoding - prefix = u'<?xml version="1.0"%s?>\n' % encoding_part + prefix = '<?xml version="1.0"%s?>\n' % encoding_part else: - prefix = u'' + prefix = '' if not pretty_print: indent_level = None else: @@ -526,4 +613,4 @@ class FeatureNotFound(ValueError): if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) - print soup.prettify() + print(soup.prettify()) diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py index fdb3362fc..cc497cf0b 100644 --- a/libs/bs4/builder/__init__.py +++ b/libs/bs4/builder/__init__.py @@ -1,5 +1,5 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" from collections import defaultdict import itertools @@ -7,8 +7,7 @@ import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, - HTMLAwareEntitySubstitution, - whitespace_re + nonwhitespace_re ) __all__ = [ @@ -90,18 +89,46 @@ class TreeBuilder(object): is_xml = False picklable = False - preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. - + # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. - cdata_list_attributes = {} - - - def __init__(self): + DEFAULT_CDATA_LIST_ATTRIBUTES = {} + + DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + + USE_DEFAULT = object() + + def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT): + """Constructor. 
+
+        :param multi_valued_attributes: If this is set to None, the
+        TreeBuilder will not turn any values for attributes like
+        'class' into lists. Setting this to a dictionary will
+        customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+        for an example.
+
+        Internally, these are called "CDATA list attributes", but that
+        probably doesn't make sense to an end-user, so the argument name
+        is `multi_valued_attributes`.
+
+        :param preserve_whitespace_tags: A set of tags to treat the way
+        <pre> is treated in HTML: their contents are exempt from
+        whitespace handling and pretty-printing. Defaults to
+        DEFAULT_PRESERVE_WHITESPACE_TAGS.
+        """
         self.soup = None
-
+        if multi_valued_attributes is self.USE_DEFAULT:
+            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+        self.cdata_list_attributes = multi_valued_attributes
+        if preserve_whitespace_tags is self.USE_DEFAULT:
+            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+        self.preserve_whitespace_tags = preserve_whitespace_tags
+
+    def initialize_soup(self, soup):
+        """The BeautifulSoup object has been initialized and is now
+        being associated with the TreeBuilder.
+        """
+        self.soup = soup
+
     def reset(self):
         pass
@@ -125,7 +152,7 @@
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags
-
+
     def feed(self, markup):
         raise NotImplementedError()
@@ -160,14 +187,14 @@
         universal = self.cdata_list_attributes.get('*', [])
         tag_specific = self.cdata_list_attributes.get(
             tag_name.lower(), None)
-        for attr in attrs.keys():
+        for attr in list(attrs.keys()):
             if attr in universal or (tag_specific and attr in tag_specific):
                 # We have a "class"-type attribute whose string
                 # value is a whitespace-separated list of
                 # values. Split it into a list.
                 value = attrs[attr]
-                if isinstance(value, basestring):
-                    values = whitespace_re.split(value)
+                if isinstance(value, str):
+                    values = nonwhitespace_re.findall(value)
                 else:
                     # html5lib sometimes calls setAttributes twice
                     # for the same tag when rearranging the parse
@@ -231,15 +258,20 @@
     Such as which tags are empty-element tags.
     """
-    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
     empty_element_tags = set([
         # These are from HTML5.
         'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
         'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
-        # These are from HTML4, removed in HTML5.
-        'spacer', 'frame'
+
+        # These are from earlier versions of HTML and are removed in HTML5.
+        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
     ])
+
+    # The HTML standard defines these as block-level elements. Beautiful
+    # Soup does not treat these elements differently from other elements,
+    # but it may do so eventually, and this information is available if
+    # you need to use it.
+    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
     # The HTML standard defines these attributes as containing a
     # space-separated list of values, not a single value. That is,
     # class="foo bar" means that the 'class' attribute has two values,
     # 'foo' and 'bar', not the single value 'foo bar'.  When we
     # encounter one of these attributes, we will parse its value into
     # a list of values if possible. Upon output, the list will be
     # converted back into a string.
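Because the BeautifulSoup constructor now forwards extra keyword arguments to the TreeBuilder, this behavior can be toggled per soup; a sketch of the effect:

    >>> from bs4 import BeautifulSoup
    >>> BeautifulSoup('<p class="a b">x</p>', "html.parser").p['class']
    ['a', 'b']
    >>> BeautifulSoup('<p class="a b">x</p>', "html.parser",
    ...               multi_valued_attributes=None).p['class']
    'a b'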
- cdata_list_attributes = { + DEFAULT_CDATA_LIST_ATTRIBUTES = { "*" : ['class', 'accesskey', 'dropzone'], "a" : ['rel', 'rev'], "link" : ['rel', 'rev'], @@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder): "output" : ["for"], } + DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + def set_up_substitutions(self, tag): # We are only interested in <meta> tags if tag.name != 'meta': diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py index 5f5489358..090bb61a8 100644 --- a/libs/bs4/builder/_html5lib.py +++ b/libs/bs4/builder/_html5lib.py @@ -1,5 +1,5 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" __all__ = [ 'HTML5TreeBuilder', @@ -15,7 +15,7 @@ from bs4.builder import ( ) from bs4.element import ( NamespacedAttribute, - whitespace_re, + nonwhitespace_re, ) import html5lib from html5lib.constants import ( @@ -33,7 +33,7 @@ try: # Pre-0.99999999 from html5lib.treebuilders import _base as treebuilder_base new_html5lib = False -except ImportError, e: +except ImportError as e: # 0.99999999 and up from html5lib.treebuilders import base as treebuilder_base new_html5lib = True @@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): parser = html5lib.HTMLParser(tree=self.create_treebuilder) extra_kwargs = dict() - if not isinstance(markup, unicode): + if not isinstance(markup, str): if new_html5lib: extra_kwargs['override_encoding'] = self.user_specified_encoding else: @@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): + if isinstance(markup, str): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: original_encoding = parser.tokenizer.stream.charEncoding[0] - if not isinstance(original_encoding, basestring): + if not isinstance(original_encoding, str): # In 0.99999999 and up, the encoding is an html5lib # Encoding object. We want to use a string for compatibility # with other tree builders. @@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<html><head></head><body>%s</body></html>' % fragment + return '<html><head></head><body>%s</body></html>' % fragment class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): @@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): rv.append("|%s<%s>" % (' ' * indent, name)) if element.attrs: attributes = [] - for name, value in element.attrs.items(): + for name, value in list(element.attrs.items()): if isinstance(name, NamespacedAttribute): name = "%s %s" % (prefixes[name.namespace], name.name) if isinstance(value, list): @@ -199,14 +199,14 @@ class AttrList(object): def __setitem__(self, name, value): # If this attribute is a multi-valued attribute for this element, # turn its value into a list. - list_attr = HTML5TreeBuilder.cdata_list_attributes + list_attr = self.element.cdata_list_attributes if (name in list_attr['*'] or (self.element.name in list_attr and name in list_attr[self.element.name])): # A node that is being cloned may have already undergone # this procedure. 
if not isinstance(value, list): - value = whitespace_re.split(value) + value = nonwhitespace_re.findall(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -229,7 +229,7 @@ class Element(treebuilder_base.Node): def appendChild(self, node): string_child = child = None - if isinstance(node, basestring): + if isinstance(node, str): # Some other piece of code decided to pass in a string # instead of creating a TextElement object to contain the # string. @@ -246,10 +246,10 @@ class Element(treebuilder_base.Node): child = node.element node.parent = self - if not isinstance(child, basestring) and child.parent is not None: + if not isinstance(child, str) and child.parent is not None: node.element.extract() - if (string_child and self.element.contents + if (string_child is not None and self.element.contents and self.element.contents[-1].__class__ == NavigableString): # We are appending a string onto another string. # TODO This has O(n^2) performance, for input like @@ -259,7 +259,7 @@ class Element(treebuilder_base.Node): old_element.replace_with(new_element) self.soup._most_recent_element = new_element else: - if isinstance(node, basestring): + if isinstance(node, str): # Create a brand new NavigableString from this string. child = self.soup.new_string(node) @@ -299,7 +299,7 @@ class Element(treebuilder_base.Node): self.soup.builder._replace_cdata_list_attribute_values( self.name, attributes) - for name, value in attributes.items(): + for name, value in list(attributes.items()): self.element[name] = value # The attributes may contain variables that need substitution. @@ -360,16 +360,16 @@ class Element(treebuilder_base.Node): # Set the first child's previous_element and previous_sibling # to elements within the new parent first_child = to_append[0] - if new_parents_last_descendant: + if new_parents_last_descendant is not None: first_child.previous_element = new_parents_last_descendant else: first_child.previous_element = new_parent_element first_child.previous_sibling = new_parents_last_child - if new_parents_last_descendant: + if new_parents_last_descendant is not None: new_parents_last_descendant.next_element = first_child else: new_parent_element.next_element = first_child - if new_parents_last_child: + if new_parents_last_child is not None: new_parents_last_child.next_sibling = first_child # Find the very last element being moved. It is now the @@ -379,7 +379,7 @@ class Element(treebuilder_base.Node): last_childs_last_descendant = to_append[-1]._last_descendant(False, True) last_childs_last_descendant.next_element = new_parents_last_descendant_next_element - if new_parents_last_descendant_next_element: + if new_parents_last_descendant_next_element is not None: # TODO: This code has no test coverage and I'm not sure # how to get html5lib to go through this path, but it's # just the other side of the previous line. diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py index 67890b3a3..ea549c356 100644 --- a/libs/bs4/builder/_htmlparser.py +++ b/libs/bs4/builder/_htmlparser.py @@ -1,17 +1,18 @@ +# encoding: utf-8 """Use the HTMLParser library to parse HTML files that aren't too bad.""" -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import HTMLParser +from html.parser import HTMLParser try: - from HTMLParser import HTMLParseError -except ImportError, e: + from html.parser import HTMLParseError +except ImportError as e: # HTMLParseError is removed in Python 3.5. Since it can never be # thrown in 3.5, we can just define our own class as a placeholder. class HTMLParseError(Exception): @@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser): # order. It's a list of closing tags we've already handled and # will ignore, assuming they ever show up. self.already_closed_empty_element = [] - + + def error(self, msg): + """In Python 3, HTMLParser subclasses must implement error(), although this + requirement doesn't appear to be documented. + + In Python 2, HTMLParser implements error() as raising an exception. + + In any event, this method is called only on very strange markup and our best strategy + is to pretend it didn't happen and keep going. + """ + warnings.warn(msg) + def handle_startendtag(self, name, attrs): # This is only called when the markup looks like # <tag/>. @@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser): else: real_name = int(name) - try: - data = unichr(real_name) - except (ValueError, OverflowError), e: - data = u"\N{REPLACEMENT CHARACTER}" - + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. “ + # instead of É for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. + for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError as e: + pass + if not data: + try: + data = chr(real_name) + except (ValueError, OverflowError) as e: + pass + data = data or "\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): @@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser): if character is not None: data = character else: - data = "&%s;" % name + # If this were XML, it would be ambiguous whether "&foo" + # was an character entity reference with a missing + # semicolon or the literal string "&foo". Since this is + # HTML, we have a complete list of all character entity references, + # and this one wasn't found, so assume it's the literal string "&foo". + data = "&%s" % name self.handle_data(data) def handle_comment(self, data): @@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): NAME = HTMLPARSER features = [NAME, HTML, STRICT] - def __init__(self, *args, **kwargs): + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - kwargs['strict'] = False + parser_kwargs['strict'] = False if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - kwargs['convert_charrefs'] = False - self.parser_args = (args, kwargs) + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): @@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). 
""" - if isinstance(markup, unicode): + if isinstance(markup, str): yield (markup, None, None, False) return @@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) - except HTMLParseError, e: + parser.close() + except HTMLParseError as e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py index d2ca2872d..a490e2301 100644 --- a/libs/bs4/builder/_lxml.py +++ b/libs/bs4/builder/_lxml.py @@ -1,13 +1,18 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + __all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', ] +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable + from io import BytesIO -from StringIO import StringIO -import collections +from io import StringIO from lxml import etree from bs4.element import ( Comment, @@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector LXML = 'lxml' +def _invert(d): + "Invert a dictionary." + return dict((v,k) for k, v in list(d.items())) + class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser @@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder): # This namespace mapping is specified in the XML Namespace # standard. - DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} + DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') + + DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + + def initialize_soup(self, soup): + """Let the BeautifulSoup object know about the standard namespace + mapping. + """ + super(LXMLTreeBuilderForXML, self).initialize_soup(soup) + self._register_namespaces(self.DEFAULT_NSMAPS) + + def _register_namespaces(self, mapping): + """Let the BeautifulSoup object know about namespaces encountered + while parsing the document. + + This might be useful later on when creating CSS selectors. + """ + for key, value in list(mapping.items()): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same + # prefix, the first one in the document takes precedence. + self.soup._namespaces[key] = value def default_parser(self, encoding): # This can either return a parser object or a class, which @@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): # Use the default parser. parser = self.default_parser(encoding) - if isinstance(parser, collections.Callable): + if isinstance(parser, Callable): # Instantiate the parser with default arguments parser = parser(target=self, strip_cdata=False, encoding=encoding) return parser - def __init__(self, parser=None, empty_element_tags=None): + def __init__(self, parser=None, empty_element_tags=None, **kwargs): # TODO: Issue a warning if parser is present but not a # callable, since that means there's no way to create new # parsers for different encodings. 
@@ -71,8 +102,9 @@
         if empty_element_tags is not None:
             self.empty_element_tags = set(empty_element_tags)
         self.soup = None
-        self.nsmaps = [self.DEFAULT_NSMAPS]
-
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
+
     def _getNsTag(self, tag):
         # Split the namespace URL out of a fully-qualified lxml tag
         # name. Copied from lxml's src/lxml/sax.py.
@@ -101,12 +133,12 @@
         else:
             self.processing_instruction_class = XMLProcessingInstruction

-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
             yield markup, None, document_declared_encoding, False

-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # No, apparently not. Convert the Unicode to UTF-8 and
             # tell lxml to parse it as UTF-8.
             yield (markup.encode("utf8"), "utf8",
@@ -121,7 +153,7 @@
     def feed(self, markup):
         if isinstance(markup, bytes):
             markup = BytesIO(markup)
-        elif isinstance(markup, unicode):
+        elif isinstance(markup, str):
             markup = StringIO(markup)

         # Call feed() at least once, even if the markup is empty,
@@ -136,30 +168,36 @@
                 if len(data) != 0:
                     self.parser.feed(data)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(str(e))

     def close(self):
-        self.nsmaps = [self.DEFAULT_NSMAPS]
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

     def start(self, name, attrs, nsmap={}):
         # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
         attrs = dict(attrs)
         nsprefix = None
         # Invert each namespace map as it comes in.
-        if len(self.nsmaps) > 1:
-            # There are no new namespaces for this tag, but
-            # non-default namespaces are in play, so we need a
-            # separate tag stack to know when they end.
-            self.nsmaps.append(None)
+        if len(nsmap) == 0 and len(self.nsmaps) > 1:
+            # There are no new namespaces for this tag, but
+            # non-default namespaces are in play, so we need a
+            # separate tag stack to know when they end.
+            self.nsmaps.append(None)
         elif len(nsmap) > 0:
             # A new namespace mapping has come into play.
-            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
-            self.nsmaps.append(inverted_nsmap)
+
+            # First, let the BeautifulSoup object know about it.
+            self._register_namespaces(nsmap)
+
+            # Then, add it to our running list of inverted namespace
+            # mappings.
+            self.nsmaps.append(_invert(nsmap))
+
             # Also treat the namespace mapping as a set of attributes on the
             # tag, so we can recreate it later.
             attrs = attrs.copy()
-            for prefix, namespace in nsmap.items():
+            for prefix, namespace in list(nsmap.items()):
                 attribute = NamespacedAttribute(
                     "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                 attrs[attribute] = namespace
@@ -168,7 +206,7 @@
         # from lxml with namespaces attached to their names, and
         # turn them into NamespacedAttribute objects.
         new_attrs = {}
-        for attr, value in attrs.items():
+        for attr, value in list(attrs.items()):
             namespace, attr = self._getNsTag(attr)
             if namespace is None:
                 new_attrs[attr] = value
@@ -228,7 +266,7 @@

     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@@ -249,10 +287,10 @@
             self.parser = self.parser_for(encoding)
             self.parser.feed(markup)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(str(e))

     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<html><body>%s</body></html>' % fragment
+        return '<html><body>%s</body></html>' % fragment
diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py
index 7965565f5..c7ac4d431 100644
--- a/libs/bs4/dammit.py
+++ b/libs/bs4/dammit.py
@@ -6,12 +6,11 @@
 necessary. It is heavily based on code from Mark Pilgrim's Universal
 Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
 __license__ = "MIT"

 import codecs
-from htmlentitydefs import codepoint2name
+from html.entities import codepoint2name
 import re
 import logging
 import string
@@ -46,9 +45,9 @@
     pass

 xml_encoding_re = re.compile(
-    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
 html_meta_re = re.compile(
-    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)

 class EntitySubstitution(object):
@@ -58,15 +57,24 @@
         lookup = {}
         reverse_lookup = {}
         characters_for_re = []
-        for codepoint, name in list(codepoint2name.items()):
-            character = unichr(codepoint)
-            if codepoint != 34:
+
+        # &apos; is an XHTML entity and an HTML 5 entity, but not an
+        # HTML 4 entity. We don't want to use it, but we want to
+        # recognize it on the way in.
+        #
+        # TODO: Ideally we would be able to recognize all HTML 5 named
+        # entities, but that's a little tricky.
+        extra = [(39, 'apos')]
+        for codepoint, name in list(codepoint2name.items()) + extra:
+            character = chr(codepoint)
+            if codepoint not in (34, 39):
                 # There's no point in turning the quotation mark into
-                # &quot;, unless it happens within an attribute value, which
-                # is handled elsewhere.
+                # &quot; or the single quote into &apos;, unless it
+                # happens within an attribute value, which is handled
+                # elsewhere.
                 characters_for_re.append(character)
                 lookup[character] = name
-            # But we do want to turn &quot; into the quotation mark.
+            # But we do want to recognize those entities on the way in and
+            # convert them to Unicode characters.
reverse_lookup[name] = character re_definition = "[%s]" % "".join(characters_for_re) return lookup, reverse_lookup, re.compile(re_definition) @@ -82,7 +90,7 @@ class EntitySubstitution(object): } BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")") AMPERSAND_OR_BRACKET = re.compile("([<>&])") @@ -274,7 +282,7 @@ class EncodingDetector: def strip_byte_order_mark(cls, data): """If a byte-order mark is present, strip it and return the encoding it implies.""" encoding = None - if isinstance(data, unicode): + if isinstance(data, str): # Unicode data cannot have a byte-order mark. return data, encoding if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ @@ -352,9 +360,9 @@ class UnicodeDammit: markup, override_encodings, is_html, exclude_encodings) # Short-circuit if the data is in Unicode to begin with. - if isinstance(markup, unicode) or markup == '': + if isinstance(markup, str) or markup == '': self.markup = markup - self.unicode_markup = unicode(markup) + self.unicode_markup = str(markup) self.original_encoding = None return @@ -438,7 +446,7 @@ class UnicodeDammit: def _to_unicode(self, data, encoding, errors="strict"): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' - return unicode(data, encoding, errors) + return str(data, encoding, errors) @property def declared_html_encoding(self): diff --git a/libs/bs4/diagnose.py b/libs/bs4/diagnose.py index 8768332f5..b5f6e6c8b 100644 --- a/libs/bs4/diagnose.py +++ b/libs/bs4/diagnose.py @@ -1,12 +1,11 @@ """Diagnostic functions, mainly for use when doing tech support.""" -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. __license__ = "MIT" import cProfile -from StringIO import StringIO -from HTMLParser import HTMLParser +from io import StringIO +from html.parser import HTMLParser import bs4 from bs4 import BeautifulSoup, __version__ from bs4.builder import builder_registry @@ -22,8 +21,8 @@ import cProfile def diagnose(data): """Diagnostic suite for isolating common problems.""" - print "Diagnostic running on Beautiful Soup %s" % __version__ - print "Python version %s" % sys.version + print("Diagnostic running on Beautiful Soup %s" % __version__) + print("Python version %s" % sys.version) basic_parsers = ["html.parser", "html5lib", "lxml"] for name in basic_parsers: @@ -32,16 +31,16 @@ def diagnose(data): break else: basic_parsers.remove(name) - print ( + print(( "I noticed that %s is not installed. Installing it may help." % - name) + name)) if 'lxml' in basic_parsers: - basic_parsers.append(["lxml", "xml"]) + basic_parsers.append("lxml-xml") try: from lxml import etree - print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) - except ImportError, e: + print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) + except ImportError as e: print ( "lxml is not installed or couldn't be imported.") @@ -49,37 +48,43 @@ def diagnose(data): if 'html5lib' in basic_parsers: try: import html5lib - print "Found html5lib version %s" % html5lib.__version__ - except ImportError, e: + print("Found html5lib version %s" % html5lib.__version__) + except ImportError as e: print ( "html5lib is not installed or couldn't be imported.") if hasattr(data, 'read'): data = data.read() - elif os.path.exists(data): - print '"%s" looks like a filename. Reading data from the file.' 
% data - with open(data) as fp: - data = fp.read() elif data.startswith("http:") or data.startswith("https:"): - print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data - print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." + print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") return - print + else: + try: + if os.path.exists(data): + print('"%s" looks like a filename. Reading data from the file.' % data) + with open(data) as fp: + data = fp.read() + except ValueError: + # This can happen on some platforms when the 'filename' is + # too long. Assume it's data and not a filename. + pass + print() for parser in basic_parsers: - print "Trying to parse your markup with %s" % parser + print("Trying to parse your markup with %s" % parser) success = False try: - soup = BeautifulSoup(data, parser) + soup = BeautifulSoup(data, features=parser) success = True - except Exception, e: - print "%s could not parse the markup." % parser + except Exception as e: + print("%s could not parse the markup." % parser) traceback.print_exc() if success: - print "Here's what %s did with the markup:" % parser - print soup.prettify() + print("Here's what %s did with the markup:" % parser) + print(soup.prettify()) - print "-" * 80 + print("-" * 80) def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. @@ -89,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs): """ from lxml import etree for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): - print("%s, %4s, %s" % (event, element.tag, element.text)) + print(("%s, %4s, %s" % (event, element.tag, element.text))) class AnnouncingParser(HTMLParser): """Announces HTMLParser parse events, without doing anything else.""" @@ -149,7 +154,7 @@ def rword(length=5): def rsentence(length=4): "Generate a random sentence-like string." - return " ".join(rword(random.randint(4,9)) for i in range(length)) + return " ".join(rword(random.randint(4,9)) for i in list(range(length))) def rdoc(num_elements=1000): """Randomly generate an invalid HTML document.""" @@ -171,9 +176,9 @@ def rdoc(num_elements=1000): def benchmark_parsers(num_elements=100000): """Very basic head-to-head performance benchmark.""" - print "Comparative parser benchmark on Beautiful Soup %s" % __version__ + print("Comparative parser benchmark on Beautiful Soup %s" % __version__) data = rdoc(num_elements) - print "Generated a large invalid HTML document (%d bytes)." % len(data) + print("Generated a large invalid HTML document (%d bytes)." % len(data)) for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: success = False @@ -182,24 +187,24 @@ def benchmark_parsers(num_elements=100000): soup = BeautifulSoup(data, parser) b = time.time() success = True - except Exception, e: - print "%s could not parse the markup." % parser + except Exception as e: + print("%s could not parse the markup." % parser) traceback.print_exc() if success: - print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) + print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) from lxml import etree a = time.time() etree.HTML(data) b = time.time() - print "Raw lxml parsed the markup in %.2fs." % (b-a) + print("Raw lxml parsed the markup in %.2fs." 
% (b-a)) import html5lib parser = html5lib.HTMLParser() a = time.time() parser.parse(data) b = time.time() - print "Raw html5lib parsed the markup in %.2fs." % (b-a) + print("Raw html5lib parsed the markup in %.2fs." % (b-a)) def profile(num_elements=100000, parser="lxml"): diff --git a/libs/bs4/element.py b/libs/bs4/element.py index 9ef75f814..f16b1663e 100644 --- a/libs/bs4/element.py +++ b/libs/bs4/element.py @@ -1,18 +1,35 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. __license__ = "MIT" -import collections +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable import re -import shlex import sys import warnings -from bs4.dammit import EntitySubstitution +try: + import soupsieve +except ImportError as e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' + ) + +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) -whitespace_re = re.compile("\s+") +nonwhitespace_re = re.compile(r"\S+") + +# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on +# the off chance someone imported it for their own use. +whitespace_re = re.compile(r"\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @@ -26,22 +43,22 @@ def _alias(attr): return alias -class NamespacedAttribute(unicode): +class NamespacedAttribute(str): def __new__(cls, prefix, name, namespace=None): if name is None: - obj = unicode.__new__(cls, prefix) + obj = str.__new__(cls, prefix) elif prefix is None: # Not really namespaced. - obj = unicode.__new__(cls, name) + obj = str.__new__(cls, name) else: - obj = unicode.__new__(cls, prefix + ":" + name) + obj = str.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace return obj -class AttributeValueWithCharsetSubstitution(unicode): +class AttributeValueWithCharsetSubstitution(str): """A stand-in object for a character encoding specified in HTML.""" class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): @@ -52,7 +69,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): """ def __new__(cls, original_value): - obj = unicode.__new__(cls, original_value) + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -69,15 +86,15 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): The value of the 'content' attribute will be one of these objects. """ - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) if match is None: # No substitution necessary. - return unicode.__new__(unicode, original_value) + return str.__new__(str, original_value) - obj = unicode.__new__(cls, original_value) + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -86,94 +103,71 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) -class HTMLAwareEntitySubstitution(EntitySubstitution): - - """Entity substitution rules that are aware of some HTML quirks. 
- - Specifically, the contents of <script> and <style> tags should not - undergo entity substitution. - - Incoming NavigableString objects are checked to see if they're the - direct children of a <script> or <style> tag. - """ - - cdata_containing_tags = set(["script", "style"]) - - preformatted_tags = set(["pre"]) + +class PageElement(object): + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=None, previous_element=None, next_element=None, + previous_sibling=None, next_sibling=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent - preserve_whitespace_tags = set(['pre', 'textarea']) + self.previous_element = previous_element + if previous_element is not None: + self.previous_element.next_element = self - @classmethod - def _substitute_if_appropriate(cls, ns, f): - if (isinstance(ns, NavigableString) - and ns.parent is not None - and ns.parent.name in cls.cdata_containing_tags): - # Do nothing. - return ns - # Substitute. - return f(ns) + self.next_element = next_element + if self.next_element is not None: + self.next_element.previous_element = self - @classmethod - def substitute_html(cls, ns): - return cls._substitute_if_appropriate( - ns, EntitySubstitution.substitute_html) + self.next_sibling = next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self - @classmethod - def substitute_xml(cls, ns): - return cls._substitute_if_appropriate( - ns, EntitySubstitution.substitute_xml) + if (previous_sibling is None + and self.parent is not None and self.parent.contents): + previous_sibling = self.parent.contents[-1] -class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" + self.previous_sibling = previous_sibling + if previous_sibling is not None: + self.previous_sibling.next_sibling = self - # There are five possible values for the "formatter" argument passed in - # to methods like encode() and prettify(): - # - # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "minimal" - Bare ampersands and angle brackets are converted to - # XML entities: & < > - # None - The null formatter. Unicode characters are never - # converted to entities. This is not recommended, but it's - # faster than "minimal". - # A function - This function will be called on every string that - # needs to undergo entity substitution. - # - - # In an HTML document, the default "html" and "minimal" functions - # will leave the contents of <script> and <style> tags alone. For - # an XML document, all tags will be given the same treatment. 
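The formatter behaviors described in the removed comment above are unchanged for end users; the machinery now lives in bs4.formatter. A quick sketch:

    >>> soup = BeautifulSoup("<p>é & e</p>", "html.parser")
    >>> soup.p.encode(formatter="minimal")
    b'<p>\xc3\xa9 &amp; e</p>'
    >>> soup.p.encode(formatter="html")
    b'<p>&eacute; &amp; e</p>'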
- - HTML_FORMATTERS = { - "html" : HTMLAwareEntitySubstitution.substitute_html, - "minimal" : HTMLAwareEntitySubstitution.substitute_xml, - None : None - } - - XML_FORMATTERS = { - "html" : EntitySubstitution.substitute_html, - "minimal" : EntitySubstitution.substitute_xml, - None : None - } - - def format_string(self, s, formatter='minimal'): + def format_string(self, s, formatter): """Format the given string using the given formatter.""" - if not callable(formatter): - formatter = self._formatter_for_name(formatter) if formatter is None: - output = s - else: - output = formatter(s) + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + output = formatter.substitute(s) return output + def formatter_for_name(self, formatter): + """Look up or create a Formatter for the given identifier, + if necessary. + + :param formatter: Can be a Formatter object (used as-is), a + function (used as the entity substitution hook for an + XMLFormatter or HTMLFormatter), or a string (used to look up + an XMLFormatter or HTMLFormatter in the appropriate registry. + """ + if isinstance(formatter, Formatter): + return formatter + if self._is_xml: + c = XMLFormatter + else: + c = HTMLFormatter + if callable(formatter): + return c(entity_substitution=formatter) + return c.REGISTRY[formatter] + @property def _is_xml(self): """Is this element part of an XML tree or an HTML tree? - This is used when mapping a formatter name ("minimal") to an - appropriate function (one that performs entity-substitution on - the contents of <script> and <style> tags, or not). It can be + This is used in formatter_for_name, when deciding whether an + XMLFormatter or HTMLFormatter is more appropriate. It can be inefficient, but it should be called very rarely. """ if self.known_xml is not None: @@ -191,48 +185,13 @@ class PageElement(object): return getattr(self, 'is_xml', False) return self.parent._is_xml - def _formatter_for_name(self, name): - "Look up a formatter function based on its name and the tree." 
- if self._is_xml: - return self.XML_FORMATTERS.get( - name, EntitySubstitution.substitute_xml) - else: - return self.HTML_FORMATTERS.get( - name, HTMLAwareEntitySubstitution.substitute_xml) - - def setup(self, parent=None, previous_element=None, next_element=None, - previous_sibling=None, next_sibling=None): - """Sets up the initial relations between this element and - other elements.""" - self.parent = parent - - self.previous_element = previous_element - if previous_element is not None: - self.previous_element.next_element = self - - self.next_element = next_element - if self.next_element: - self.next_element.previous_element = self - - self.next_sibling = next_sibling - if self.next_sibling: - self.next_sibling.previous_sibling = self - - if (not previous_sibling - and self.parent is not None and self.parent.contents): - previous_sibling = self.parent.contents[-1] - - self.previous_sibling = previous_sibling - if previous_sibling: - self.previous_sibling.next_sibling = self - nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 def replace_with(self, replace_with): - if not self.parent: + if self.parent is None: raise ValueError( - "Cannot replace one element with another when the" + "Cannot replace one element with another when the " "element to be replaced is not part of a tree.") if replace_with is self: return @@ -247,7 +206,7 @@ class PageElement(object): def unwrap(self): my_parent = self.parent - if not self.parent: + if self.parent is None: raise ValueError( "Cannot replace an element with its contents when that" "element is not part of a tree.") @@ -295,7 +254,7 @@ class PageElement(object): def _last_descendant(self, is_initialized=True, accept_self=True): "Finds the last element beneath this object to be parsed." - if is_initialized and self.next_sibling: + if is_initialized and self.next_sibling is not None: last_child = self.next_sibling.previous_element else: last_child = self @@ -312,10 +271,18 @@ class PageElement(object): raise ValueError("Cannot insert None into a tag.") if new_child is self: raise ValueError("Cannot insert a tag into itself.") - if (isinstance(new_child, basestring) + if (isinstance(new_child, str) and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) + from bs4 import BeautifulSoup + if isinstance(new_child, BeautifulSoup): + # We don't want to end up with a situation where one BeautifulSoup + # object contains another. Insert the children one at a time. + for subchild in list(new_child.contents): + self.insert(position, subchild) + position += 1 + return position = min(position, len(self.contents)) if hasattr(new_child, 'parent') and new_child.parent is not None: # We're 'inserting' an element that's already one @@ -377,43 +344,54 @@ class PageElement(object): """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) - def insert_before(self, predecessor): - """Makes the given element the immediate predecessor of this one. + def extend(self, tags): + """Appends the given tags to the contents of this tag.""" + for tag in tags: + self.append(tag) - The two elements will have the same parent, and the given element + def insert_before(self, *args): + """Makes the given element(s) the immediate predecessor of this one. + + The elements will have the same parent, and the given elements will be immediately before this one. 
""" - if self is predecessor: - raise ValueError("Can't insert an element before itself.") parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'before' has no meaning.") - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(predecessor, PageElement): - predecessor.extract() - index = parent.index(self) - parent.insert(index, predecessor) - - def insert_after(self, successor): - """Makes the given element the immediate successor of this one. - - The two elements will have the same parent, and the given element + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, *args): + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same parent, and the given elements will be immediately after this one. """ - if self is successor: - raise ValueError("Can't insert an element after itself.") + # Do all error checking before modifying the tree. parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'after' has no meaning.") - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(successor, PageElement): - successor.extract() - index = parent.index(self) - parent.insert(index+1, successor) + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1+offset, successor) + offset += 1 def find_next(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and @@ -533,17 +511,24 @@ class PageElement(object): result = (element for element in generator if isinstance(element, Tag)) return ResultSet(strainer, result) - elif isinstance(name, basestring): + elif isinstance(name, str): # Optimization to find all tags with a given name. if name.count(':') == 1: - # This is a name with a prefix. - prefix, name = name.split(':', 1) + # This is a name with a prefix. If this is a namespace-aware document, + # we need to match the local name against tag.name. If not, + # we need to match the fully-qualified name against tag.name. + prefix, local_name = name.split(':', 1) else: prefix = None + local_name = name result = (element for element in generator if isinstance(element, Tag) - and element.name == name - and (prefix is None or element.prefix == prefix) + and ( + element.name == name + ) or ( + element.name == local_name + and (prefix is None or element.prefix == prefix) + ) ) return ResultSet(strainer, result) results = ResultSet(strainer) @@ -597,82 +582,6 @@ class PageElement(object): yield i i = i.parent - # Methods for supporting CSS selectors. 
- - tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') - - # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ - # \---------------------------/ \---/\-------------/ \-------/ - # | | | | - # | | | The value - # | | ~,|,^,$,* or = - # | Attribute - # Tag - attribselect_re = re.compile( - r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + - r'=?"?(?P<value>[^\]"]*)"?\]$' - ) - - def _attr_value_as_string(self, value, default=None): - """Force an attribute value into a string representation. - - A multi-valued attribute will be converted into a - space-separated stirng. - """ - value = self.get(value, default) - if isinstance(value, list) or isinstance(value, tuple): - value =" ".join(value) - return value - - def _tag_name_matches_and(self, function, tag_name): - if not tag_name: - return function - else: - def _match(tag): - return tag.name == tag_name and function(tag) - return _match - - def _attribute_checker(self, operator, attribute, value=''): - """Create a function that performs a CSS selector operation. - - Takes an operator, attribute and optional value. Returns a - function that will return True for elements that match that - combination. - """ - if operator == '=': - # string representation of `attribute` is equal to `value` - return lambda el: el._attr_value_as_string(attribute) == value - elif operator == '~': - # space-separated list representation of `attribute` - # contains `value` - def _includes_value(element): - attribute_value = element.get(attribute, []) - if not isinstance(attribute_value, list): - attribute_value = attribute_value.split() - return value in attribute_value - return _includes_value - elif operator == '^': - # string representation of `attribute` starts with `value` - return lambda el: el._attr_value_as_string( - attribute, '').startswith(value) - elif operator == '$': - # string representation of `attribute` ends with `value` - return lambda el: el._attr_value_as_string( - attribute, '').endswith(value) - elif operator == '*': - # string representation of `attribute` contains `value` - return lambda el: value in el._attr_value_as_string(attribute, '') - elif operator == '|': - # string representation of `attribute` is either exactly - # `value` or starts with `value` and then a dash. - def _is_or_starts_with_dash(element): - attribute_value = element._attr_value_as_string(attribute, '') - return (attribute_value == value or attribute_value.startswith( - value + '-')) - return _is_or_starts_with_dash - else: - return lambda el: el.has_attr(attribute) - # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): @@ -691,7 +600,7 @@ class PageElement(object): return self.parents -class NavigableString(unicode, PageElement): +class NavigableString(str, PageElement): PREFIX = '' SUFFIX = '' @@ -709,10 +618,10 @@ class NavigableString(unicode, PageElement): passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. 
""" - if isinstance(value, unicode): - u = unicode.__new__(cls, value) + if isinstance(value, str): + u = str.__new__(cls, value) else: - u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) u.setup() return u @@ -723,7 +632,7 @@ class NavigableString(unicode, PageElement): return type(self)(self) def __getnewargs__(self): - return (unicode(self),) + return (str(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -737,6 +646,7 @@ class NavigableString(unicode, PageElement): self.__class__.__name__, attr)) def output_ready(self, formatter="minimal"): + """Run the string through the provided formatter.""" output = self.format_string(self, formatter) return self.PREFIX + output + self.SUFFIX @@ -755,37 +665,39 @@ class PreformattedString(NavigableString): but the return value will be ignored. """ - def output_ready(self, formatter="minimal"): - """CData strings are passed into the formatter. - But the return value is ignored.""" - self.format_string(self, formatter) + def output_ready(self, formatter=None): + """CData strings are passed into the formatter, purely + for any side effects. The return value is ignored. + """ + if formatter is not None: + ignore = self.format_string(self, formatter) return self.PREFIX + self + self.SUFFIX class CData(PreformattedString): - PREFIX = u'<![CDATA[' - SUFFIX = u']]>' + PREFIX = '<![CDATA[' + SUFFIX = ']]>' class ProcessingInstruction(PreformattedString): """A SGML processing instruction.""" - PREFIX = u'<?' - SUFFIX = u'>' + PREFIX = '<?' + SUFFIX = '>' class XMLProcessingInstruction(ProcessingInstruction): """An XML processing instruction.""" - PREFIX = u'<?' - SUFFIX = u'?>' + PREFIX = '<?' + SUFFIX = '?>' class Comment(PreformattedString): - PREFIX = u'<!--' - SUFFIX = u'-->' + PREFIX = '<!--' + SUFFIX = '-->' class Declaration(PreformattedString): - PREFIX = u'<?' - SUFFIX = u'?>' + PREFIX = '<?' + SUFFIX = '?>' class Doctype(PreformattedString): @@ -802,8 +714,8 @@ class Doctype(PreformattedString): return Doctype(value) - PREFIX = u'<!DOCTYPE ' - SUFFIX = u'>\n' + PREFIX = '<!DOCTYPE ' + SUFFIX = '>\n' class Tag(PageElement): @@ -826,14 +738,6 @@ class Tag(PageElement): self.name = name self.namespace = namespace self.prefix = prefix - if builder is not None: - preserve_whitespace_tags = builder.preserve_whitespace_tags - else: - if is_xml: - preserve_whitespace_tags = [] - else: - preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags - self.preserve_whitespace_tags = preserve_whitespace_tags if attrs is None: attrs = {} elif attrs: @@ -856,13 +760,32 @@ class Tag(PageElement): self.setup(parent, previous) self.hidden = False - # Set up any substitutions, such as the charset in a META tag. - if builder is not None: + if builder is None: + # In the absence of a TreeBuilder, assume this tag is nothing + # special. + self.can_be_empty_element = False + self.cdata_list_attributes = None + else: + # Set up any substitutions for this tag, such as the charset in a META tag. builder.set_up_substitutions(self) + + # Ask the TreeBuilder whether this tag might be an empty-element tag. self.can_be_empty_element = builder.can_be_empty_element(name) - else: - self.can_be_empty_element = False + # Keep track of the list of attributes of this tag that + # might need to be treated as a list. + # + # For performance reasons, we store the whole data structure + # rather than asking the question of every tag. 
Asking would + # require building a new data structure every time, and + # (unlike can_be_empty_element), we almost never need + # to check this. + self.cdata_list_attributes = builder.cdata_list_attributes + + # Keep track of the names that might cause this tag to be treated as a + # whitespace-preserved tag. + self.preserve_whitespace_tags = builder.preserve_whitespace_tags + parserClass = _alias("parser_class") # BS3 def __copy__(self): @@ -942,7 +865,7 @@ class Tag(PageElement): for string in self._all_strings(True): yield string - def get_text(self, separator=u"", strip=False, + def get_text(self, separator="", strip=False, types=(NavigableString, CData)): """ Get all child strings, concatenated using the given separator. @@ -976,6 +899,43 @@ class Tag(PageElement): for element in self.contents[:]: element.extract() + def smooth(self): + """Smooth out this element's children by consolidating consecutive strings. + + This makes pretty-printed output look more natural following a + lot of operations that modified the tree. + """ + # Mark the first position of every pair of children that need + # to be consolidated. Do this rather than making a copy of + # self.contents, since in most cases very few strings will be + # affected. + marked = [] + for i, a in enumerate(self.contents): + if isinstance(a, Tag): + # Recursively smooth children. + a.smooth() + if i == len(self.contents)-1: + # This is the last item in .contents, and it's not a + # tag. There's no chance it needs any work. + continue + b = self.contents[i+1] + if (isinstance(a, NavigableString) + and isinstance(b, NavigableString) + and not isinstance(a, PreformattedString) + and not isinstance(b, PreformattedString) + ): + marked.append(i) + + # Go over the marked positions in reverse order, so that + # removing items from .contents won't affect the remaining + # positions. + for i in reversed(marked): + a = self.contents[i] + b = self.contents[i+1] + b.extract() + n = NavigableString(a+b) + a.replace_with(n) + def index(self, element): """ Find the index of a child by identity, not value. Avoids issues with @@ -1021,7 +981,7 @@ class Tag(PageElement): def __contains__(self, x): return x in self.contents - def __nonzero__(self): + def __bool__(self): "A tag is non-None even if it has no contents." return True @@ -1046,8 +1006,10 @@ class Tag(PageElement): # BS3: soup.aTag -> "soup.find("a") tag_name = tag[:-3] warnings.warn( - '.%sTag is deprecated, use .find("%s") instead.' % ( - tag_name, tag_name)) + '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( + name=tag_name + ) + ) return self.find(tag_name) # We special case contents to avoid recursion. elif not tag.startswith("__") and not tag == "contents": @@ -1108,14 +1070,6 @@ class Tag(PageElement): u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) - def _should_pretty_print(self, indent_level): - """Should this tag be pretty-printed?""" - - return ( - indent_level is not None - and self.name not in self.preserve_whitespace_tags - ) - def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): @@ -1129,31 +1083,32 @@ class Tag(PageElement): encoding. """ - # First off, turn a string formatter into a function. This - # will stop the lookup from happening over and over again. 
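# Editor's sketch of the new Tag.smooth() defined above: after tree edits,
# adjacent NavigableStrings are merged so pretty-printed output looks
# natural (markup invented for the example):
from bs4 import BeautifulSoup
soup = BeautifulSoup("<p>a</p>", "html.parser")
soup.p.append("b")   # .contents now holds two adjacent string children
soup.smooth()        # consolidates them into a single NavigableString
assert soup.p.string == "ab"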
- if not callable(formatter): - formatter = self._formatter_for_name(formatter) - + # First off, turn a non-Formatter `formatter` into a Formatter + # object. This will stop the lookup from happening over and + # over again. + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + attributes = formatter.attributes(self) attrs = [] - if self.attrs: - for key, val in sorted(self.attrs.items()): - if val is None: - decoded = key - else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, basestring): - val = unicode(val) - elif ( + for key, val in attributes: + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, str): + val = str(val) + elif ( isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None): - val = val.encode(eventual_encoding) - - text = self.format_string(val, formatter) - decoded = ( - unicode(key) + '=' - + EntitySubstitution.quoted_attribute_value(text)) - attrs.append(decoded) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + str(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) close = '' closeTag = '' @@ -1162,7 +1117,7 @@ class Tag(PageElement): prefix = self.prefix + ":" if self.is_empty_element: - close = '/' + close = formatter.void_element_close_prefix or '' else: closeTag = '</%s%s>' % (prefix, self.name) @@ -1177,7 +1132,8 @@ class Tag(PageElement): else: indent_contents = None contents = self.decode_contents( - indent_contents, eventual_encoding, formatter) + indent_contents, eventual_encoding, formatter + ) if self.hidden: # This is the 'document root' object. @@ -1209,6 +1165,13 @@ class Tag(PageElement): s = ''.join(s) return s + def _should_pretty_print(self, indent_level): + """Should this tag be pretty-printed?""" + return ( + indent_level is not None + and self.name not in self.preserve_whitespace_tags + ) + def prettify(self, encoding=None, formatter="minimal"): if encoding is None: return self.decode(True, formatter=formatter) @@ -1224,19 +1187,19 @@ class Tag(PageElement): indented this many spaces. :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ + encoded into this encoding. decode_contents() is _not_ responsible for performing that encoding. This information is passed in so that it can be substituted in if the document contains a <META> tag that mentions the document's encoding. - :param formatter: The output formatter responsible for converting - entities to Unicode characters. + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. """ - # First off, turn a string formatter into a function. This + # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. 
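# Editor's sketch: decode() now normalizes its `formatter` argument into a
# Formatter object up front. The standard named formatters still apply;
# a small illustration of the difference between two of them:
from bs4 import BeautifulSoup
p = BeautifulSoup("<p>caf\u00e9 & co</p>", "html.parser").p
print(p.decode(formatter="minimal"))  # <p>café &amp; co</p>
print(p.decode(formatter="html"))     # <p>caf&eacute; &amp; co</p>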
- if not callable(formatter): - formatter = self._formatter_for_name(formatter) + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) pretty_print = (indent_level is not None) s = [] @@ -1247,16 +1210,19 @@ class Tag(PageElement): elif isinstance(c, Tag): s.append(c.decode(indent_level, eventual_encoding, formatter)) - if text and indent_level and not self.name == 'pre': + preserve_whitespace = ( + self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags + ) + if text and indent_level and not preserve_whitespace: text = text.strip() if text: - if pretty_print and not self.name == 'pre': + if pretty_print and not preserve_whitespace: s.append(" " * (indent_level - 1)) s.append(text) - if pretty_print and not self.name == 'pre': + if pretty_print and not preserve_whitespace: s.append("\n") return ''.join(s) - + def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): @@ -1331,236 +1297,41 @@ class Tag(PageElement): current = current.next_element # CSS selector code - - _selector_combinators = ['>', '+', '~'] - _select_debug = False - quoted_colon = re.compile('"[^"]*:[^"]*"') - def select_one(self, selector): + def select_one(self, selector, namespaces=None, **kwargs): """Perform a CSS selection operation on the current element.""" - value = self.select(selector, limit=1) + value = self.select(selector, namespaces, 1, **kwargs) if value: return value[0] return None - def select(self, selector, _candidate_generator=None, limit=None): - """Perform a CSS selection operation on the current element.""" + def select(self, selector, namespaces=None, limit=None, **kwargs): + """Perform a CSS selection operation on the current element. - # Handle grouping selectors if ',' exists, ie: p,a - if ',' in selector: - context = [] - for partial_selector in selector.split(','): - partial_selector = partial_selector.strip() - if partial_selector == '': - raise ValueError('Invalid group selection syntax: %s' % selector) - candidates = self.select(partial_selector, limit=limit) - for candidate in candidates: - if candidate not in context: - context.append(candidate) - - if limit and len(context) >= limit: - break - return context - tokens = shlex.split(selector) - current_context = [self] - - if tokens[-1] in self._selector_combinators: - raise ValueError( - 'Final combinator "%s" is missing an argument.' % tokens[-1]) + This uses the SoupSieve library. - if self._select_debug: - print 'Running CSS selector "%s"' % selector + :param selector: A string containing a CSS selector. - for index, token in enumerate(tokens): - new_context = [] - new_context_ids = set([]) + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. - if tokens[index-1] in self._selector_combinators: - # This token was consumed by the previous combinator. Skip it. - if self._select_debug: - print ' Token was consumed by the previous combinator.' - continue + :param limit: After finding this number of results, stop looking. - if self._select_debug: - print ' Considering token "%s"' % token - recursive_candidate_generator = None - tag_name = None - - # Each operation corresponds to a checker function, a rule - # for determining whether a candidate matches the - # selector. Candidates are generated by the active - # iterator. 
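# Editor's sketch: select()/select_one() now delegate to the external
# soupsieve package; the hand-rolled engine removed below used to do this
# work. Assuming soupsieve is installed:
from bs4 import BeautifulSoup
soup = BeautifulSoup('<div id="m"><p class="x">hi</p></div>', "html.parser")
print(soup.select_one("div#m > p.x").string)  # hi
print(len(soup.select("p", limit=1)))         # 1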
- checker = None - - m = self.attribselect_re.match(token) - if m is not None: - # Attribute selector - tag_name, attribute, operator, value = m.groups() - checker = self._attribute_checker(operator, attribute, value) - - elif '#' in token: - # ID selector - tag_name, tag_id = token.split('#', 1) - def id_matches(tag): - return tag.get('id', None) == tag_id - checker = id_matches - - elif '.' in token: - # Class selector - tag_name, klass = token.split('.', 1) - classes = set(klass.split('.')) - def classes_match(candidate): - return classes.issubset(candidate.get('class', [])) - checker = classes_match - - elif ':' in token and not self.quoted_colon.search(token): - # Pseudo-class - tag_name, pseudo = token.split(':', 1) - if tag_name == '': - raise ValueError( - "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) - found = [] - if pseudo_attributes is None: - pseudo_type = pseudo - pseudo_value = None - else: - pseudo_type, pseudo_value = pseudo_attributes.groups() - if pseudo_type == 'nth-of-type': - try: - pseudo_value = int(pseudo_value) - except: - raise NotImplementedError( - 'Only numeric values are currently supported for the nth-of-type pseudo-class.') - if pseudo_value < 1: - raise ValueError( - 'nth-of-type pseudo-class value must be at least 1.') - class Counter(object): - def __init__(self, destination): - self.count = 0 - self.destination = destination - - def nth_child_of_type(self, tag): - self.count += 1 - if self.count == self.destination: - return True - else: - return False - checker = Counter(pseudo_value).nth_child_of_type - else: - raise NotImplementedError( - 'Only the following pseudo-classes are implemented: nth-of-type.') - - elif token == '*': - # Star selector -- matches everything - pass - elif token == '>': - # Run the next token as a CSS selector against the - # direct children of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.children - elif token == '~': - # Run the next token as a CSS selector against the - # siblings of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.next_siblings - elif token == '+': - # For each tag in the current context, run the next - # token as a CSS selector against the tag's next - # sibling that's a tag. - def next_tag_sibling(tag): - yield tag.find_next_sibling(True) - recursive_candidate_generator = next_tag_sibling - - elif self.tag_name_re.match(token): - # Just a tag name. - tag_name = token - else: - raise ValueError( - 'Unsupported or invalid CSS selector: "%s"' % token) - if recursive_candidate_generator: - # This happens when the selector looks like "> foo". - # - # The generator calls select() recursively on every - # member of the current context, passing in a different - # candidate generator and a different selector. - # - # In the case of "> foo", the candidate generator is - # one that yields a tag's direct children (">"), and - # the selector is "foo". 
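# Editor's note, as a sketch: the engine being removed here special-cased
# #id, .class, attribute selectors, combinators, and a numeric-only
# nth-of-type. soupsieve covers all of these, so selectors the old code
# rejected now work:
from bs4 import BeautifulSoup
soup = BeautifulSoup("<p>1</p><p>2</p><p>3</p>", "html.parser")
print(soup.select("p:nth-of-type(2)")[0].string)  # 2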
- next_token = tokens[index+1] - def recursive_select(tag): - if self._select_debug: - print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) - print '-' * 40 - for i in tag.select(next_token, recursive_candidate_generator): - if self._select_debug: - print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) - yield i - if self._select_debug: - print '-' * 40 - _use_candidate_generator = recursive_select - elif _candidate_generator is None: - # By default, a tag's candidates are all of its - # children. If tag_name is defined, only yield tags - # with that name. - if self._select_debug: - if tag_name: - check = "[any]" - else: - check = tag_name - print ' Default candidate generator, tag name="%s"' % check - if self._select_debug: - # This is redundant with later code, but it stops - # a bunch of bogus tags from cluttering up the - # debug log. - def default_candidate_generator(tag): - for child in tag.descendants: - if not isinstance(child, Tag): - continue - if tag_name and not child.name == tag_name: - continue - yield child - _use_candidate_generator = default_candidate_generator - else: - _use_candidate_generator = lambda tag: tag.descendants - else: - _use_candidate_generator = _candidate_generator - - count = 0 - for tag in current_context: - if self._select_debug: - print " Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs)) - for candidate in _use_candidate_generator(tag): - if not isinstance(candidate, Tag): - continue - if tag_name and candidate.name != tag_name: - continue - if checker is not None: - try: - result = checker(candidate) - except StopIteration: - # The checker has decided we should no longer - # run the generator. - break - if checker is None or result: - if self._select_debug: - print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) - if id(candidate) not in new_context_ids: - # If a tag matches a selector more than once, - # don't include it in the context more than once. - new_context.append(candidate) - new_context_ids.add(id(candidate)) - elif self._select_debug: - print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) - - current_context = new_context - if limit and len(current_context) >= limit: - current_context = current_context[:limit] - - if self._select_debug: - print "Final verdict:" - for i in current_context: - print " %s %s" % (i.name, i.attrs) - return current_context + :param kwargs: Any extra arguments you'd like to pass in to + soupsieve.select(). + """ + if namespaces is None: + namespaces = self._namespaces + + if limit is None: + limit = 0 + if soupsieve is None: + raise NotImplementedError( + "Cannot execute CSS selectors because the soupsieve package is not installed." + ) + + return soupsieve.select(selector, self, namespaces, limit, **kwargs) # Old names for backwards compatibility def childGenerator(self): @@ -1603,7 +1374,7 @@ class SoupStrainer(object): else: attrs = kwargs normalized_attrs = {} - for key, value in attrs.items(): + for key, value in list(attrs.items()): normalized_attrs[key] = self._normalize_search_value(value) self.attrs = normalized_attrs @@ -1612,7 +1383,7 @@ class SoupStrainer(object): def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. 
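# Editor's sketch: the normalization below is what lets a SoupStrainer
# (and hence find_all) accept strings, regexes, booleans, callables, or
# lists of any of these. Illustrative markup; parse_only restricts what
# gets parsed into the tree at all:
import re
from bs4 import BeautifulSoup
from bs4.element import SoupStrainer
links = SoupStrainer("a", href=re.compile(r"^https:"))
soup = BeautifulSoup('<a href="https://x">x</a><p>no</p>',
                     "html.parser", parse_only=links)
print(soup.decode())  # only the matching <a> made it into the tree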
- if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') + if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value @@ -1625,7 +1396,7 @@ class SoupStrainer(object): new_value = [] for v in value: if (hasattr(v, '__iter__') and not isinstance(v, bytes) - and not isinstance(v, unicode)): + and not isinstance(v, str)): # This is almost certainly the user's mistake. In the # interests of avoiding infinite loops, we'll let # it through as-is rather than doing a recursive call. @@ -1637,7 +1408,7 @@ class SoupStrainer(object): # Otherwise, convert it into a Unicode string. # The unicode(str()) thing is so this will do the same thing on Python 2 # and Python 3. - return unicode(str(value)) + return str(str(value)) def __str__(self): if self.text: @@ -1652,7 +1423,7 @@ class SoupStrainer(object): markup = markup_name markup_attrs = markup call_function_with_tag_data = ( - isinstance(self.name, collections.Callable) + isinstance(self.name, Callable) and not isinstance(markup_name, Tag)) if ((not self.name) @@ -1691,7 +1462,7 @@ class SoupStrainer(object): found = None # If given a list of items, scan it for a text element that # matches. - if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): @@ -1704,7 +1475,7 @@ class SoupStrainer(object): found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): + isinstance(markup, str): if not self.name and not self.attrs and self._matches(markup, self.text): found = markup else: @@ -1732,7 +1503,7 @@ class SoupStrainer(object): # True matches any non-None value. return markup is not None - if isinstance(match_against, collections.Callable): + if isinstance(match_against, Callable): return match_against(markup) # Custom callables take the tag as an argument, but all @@ -1749,7 +1520,7 @@ class SoupStrainer(object): return not match_against if (hasattr(match_against, '__iter__') - and not isinstance(match_against, basestring)): + and not isinstance(match_against, str)): # We're asked to match against an iterable of items. # The markup must be match at least one item in the # iterable. We'll try each one in turn. @@ -1776,7 +1547,7 @@ class SoupStrainer(object): # the tag's name and once against its prefixed name. match = False - if not match and isinstance(match_against, unicode): + if not match and isinstance(match_against, str): # Exact string match match = markup == match_against diff --git a/libs/bs4/testing.py b/libs/bs4/testing.py index 6ba2506c4..cc9966601 100644 --- a/libs/bs4/testing.py +++ b/libs/bs4/testing.py @@ -1,7 +1,7 @@ +# encoding: utf-8 """Helper classes for tests.""" -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. 
__license__ = "MIT" import pickle @@ -16,29 +16,66 @@ from bs4.element import ( ContentMetaAttributeValue, Doctype, SoupStrainer, + Tag ) from bs4.builder import HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder +BAD_DOCUMENT = """A bare string +<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"> +<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd"> +<div><![CDATA[A CDATA section where it doesn't belong]]></div> +<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div> +<div>A <meta> tag</div> +<div>A <br> tag that supposedly has contents.</br></div> +<div>AT&T</div> +<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div> +<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div> +<div>This numeric entity is missing the final semicolon: <x t="piñata"></div> +<div><a href="http://example.com/</a> that attribute value never got closed</div> +<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div> +<! This document starts with a bogus declaration ><div>a</div> +<div>This document contains <!an incomplete declaration <div>(do you see it?)</div> +<div>This document ends with <!an incomplete declaration +<div><a style={height:21px;}>That attribute value was bogus</a></div> +<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace +<div><table><td nowrap>That boolean attribute had no value</td></table></div> +<div>Here's a nonexistent entity: &#foo; (do you see it?)</div> +<div>This document ends before the entity finishes: > +<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p> +<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b> +<div><table><tr><td>Here's a table</td></tr></table></div> +<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div> +<div>This tag contains nothing but whitespace: <b> </b></div> +<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div> +<div><table><div>This table contains bare markup</div></table></div> +<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div> +<div>This document contains a <!DOCTYPE surprise>surprise doctype</div> +<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div> +<div><our\u2603>Tag name contains Unicode characters</our\u2603></div> +<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +""" + class SoupTest(unittest.TestCase): @property def default_builder(self): - return default_builder() + return default_builder def soup(self, markup, **kwargs): """Build a Beautiful Soup object from markup.""" builder = kwargs.pop('builder', self.default_builder) return BeautifulSoup(markup, builder=builder, **kwargs) - def document_for(self, markup): + def document_for(self, markup, **kwargs): """Turn an HTML fragment into a document. The details depend on the builder. 
""" - return self.default_builder.test_fragment_to_document(markup) + return self.default_builder(**kwargs).test_fragment_to_document(markup) def assertSoupEquals(self, to_parse, compare_parsed_to=None): builder = self.default_builder @@ -59,6 +96,121 @@ class SoupTest(unittest.TestCase): self.assertEqual(earlier, e.previous_element) earlier = e + def linkage_validator(self, el, _recursive_call=False): + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. + if el.parent is None: + assert el.previous_element is None,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_element, None + ) + assert el.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_sibling, None + ) + assert el.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_sibling, None + ) + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + assert el.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_element, child + ) + assert child.previous_element is el,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + child, child.previous_element, el + ) + assert child.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( + child, child.previous_sibling, None + ) + + # If not the first child, previous index should link as sibling to this index + # Previous element should match the last index or the last bubbled up descendant + else: + assert child.previous_sibling is el.contents[idx - 1],\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( + child, child.previous_sibling, el.contents[idx - 1] + ) + assert el.contents[idx - 1].next_sibling is child,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + el.contents[idx - 1], el.contents[idx - 1].next_sibling, child + ) + + if last_child is not None: + assert child.previous_element is last_child,\ + "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( + child, child.previous_element, last_child, child.parent.contents + ) + assert last_child.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + last_child, last_child.next_element, child + ) + + if isinstance(child, Tag) and child.contents: + descendant = self.linkage_validator(child, True) + # A bubbled up descendant should have no next siblings + assert descendant.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + descendant, descendant.next_sibling, None + ) + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant + else: + last_child = child + + # If last child, there are non next siblings + if idx == last_idx: + assert child.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_sibling, None + ) + idx += 1 + + child = descendant if descendant is not None else child + if child is None: + child = el + + if not _recursive_call and child is not None: + target = el + while True: + if target is None: + 
assert child.next_element is None, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, None + ) + break + elif target.next_sibling is not None: + assert child.next_element is target.next_sibling, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, target.next_sibling + ) + break + target = target.parent + + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child + + class HTMLTreeBuilderSmokeTest(object): """A basic test of a treebuilder's competence. @@ -80,7 +232,7 @@ class HTMLTreeBuilderSmokeTest(object): soup = self.soup("") new_tag = soup.new_tag(name) self.assertEqual(True, new_tag.is_empty_element) - + def test_pickle_and_unpickle_identity(self): # Pickling a tree, then unpickling it, yields a tree identical # to the original. @@ -150,12 +302,20 @@ class HTMLTreeBuilderSmokeTest(object): soup.encode("utf-8").replace(b"\n", b""), markup.replace(b"\n", b"")) + def test_namespaced_html(self): + """When a namespaced XML document is parsed as HTML it should + be treated as HTML with weird tag names. + """ + markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""" + soup = self.soup(markup) + self.assertEqual(2, len(soup.find_all("ns1:foo"))) + def test_processing_instruction(self): # We test both Unicode and bytestring to verify that # process_markup correctly sets processing_instruction_class # even when the markup is already Unicode and there is no # need to process anything. - markup = u"""<?PITarget PIContent?>""" + markup = """<?PITarget PIContent?>""" soup = self.soup(markup) self.assertEqual(markup, soup.decode()) @@ -292,6 +452,18 @@ Hello, world! "<tbody><tr><td>Bar</td></tr></tbody>" "<tfoot><tr><td>Baz</td></tr></tfoot></table>") + def test_multivalued_attribute_with_whitespace(self): + # Whitespace separating the values of a multi-valued attribute + # should be ignored. + + markup = '<div class=" foo bar "></a>' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.div['class']) + + # If you search by the literal name of the class it's like the whitespace + # wasn't there. + self.assertEqual(soup.div, soup.find('div', class_="foo bar")) + def test_deeply_nested_multivalued_attribute(self): # html5lib can set the attributes of the same tag many times # as it rearranges the tree. This has caused problems with @@ -311,15 +483,41 @@ Hello, world! def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') + def test_strings_resembling_character_entity_references(self): + # "&T" and "&p" look like incomplete character entities, but they are + # not. + self.assertSoupEquals( + "<p>• AT&T is in the s&p 500</p>", + "<p>\u2022 AT&T is in the s&p 500</p>" + ) + + def test_apos_entity(self): + self.assertSoupEquals( + "<p>Bob's Bar</p>", + "<p>Bob's Bar</p>", + ) + + def test_entities_in_foreign_document_encoding(self): + # “ and ” are invalid numeric entities referencing + # Windows-1252 characters. - references a character common + # to Windows-1252 and Unicode, and ☃ references a + # character only found in Unicode. + # + # All of these entities should be converted to Unicode + # characters. 
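# Editor's sketch of the conversion this test pins down, for html.parser:
from bs4 import BeautifulSoup
p = BeautifulSoup("<p>&#147;Hi&#148;</p>", "html.parser").p
print(p.string)  # “Hi” -- curly quotes, not C1 control characters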
+ markup = "<p>“Hello” -☃</p>" + soup = self.soup(markup) + self.assertEqual("“Hello” -☃", soup.p.string) + def test_entities_in_attributes_converted_to_unicode(self): - expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' + expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' self.assertSoupEquals('<p id="piñata"></p>', expect) self.assertSoupEquals('<p id="piñata"></p>', expect) self.assertSoupEquals('<p id="piñata"></p>', expect) self.assertSoupEquals('<p id="piñata"></p>', expect) def test_entities_in_text_converted_to_unicode(self): - expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' + expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' self.assertSoupEquals("<p>piñata</p>", expect) self.assertSoupEquals("<p>piñata</p>", expect) self.assertSoupEquals("<p>piñata</p>", expect) @@ -330,11 +528,11 @@ Hello, world! '<p>I said "good day!"</p>') def test_out_of_range_entity(self): - expect = u"\N{REPLACEMENT CHARACTER}" + expect = "\N{REPLACEMENT CHARACTER}" self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) - + def test_multipart_strings(self): "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") @@ -408,9 +606,9 @@ Hello, world! # A seemingly innocuous document... but it's in Unicode! And # it contains characters that can't be represented in the # encoding found in the declaration! The horror! - markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' + markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' soup = self.soup(markup) - self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) + self.assertEqual('Sacr\xe9 bleu!', soup.body.string) def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" @@ -450,7 +648,7 @@ Hello, world! # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "<p><<sacré bleu!>></p>" - expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" + expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): @@ -460,15 +658,15 @@ Hello, world! soup = self.soup(quote) self.assertEqual( soup.p.string, - u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup("<a> </a>") - self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) def test_entities_converted_on_the_way_out(self): text = "<p><<sacré bleu!>></p>" - expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8") + expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8") soup = self.soup(text) self.assertEqual(soup.p.encode("utf-8"), expected) @@ -477,7 +675,7 @@ Hello, world! # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. 
- unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' + unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. @@ -586,6 +784,13 @@ Hello, world! data.a['foo'] = 'bar' self.assertEqual('<a foo="bar">text</a>', data.a.decode()) + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + class XMLTreeBuilderSmokeTest(object): def test_pickle_and_unpickle_identity(self): @@ -624,6 +829,17 @@ class XMLTreeBuilderSmokeTest(object): self.assertEqual( soup.encode("utf-8"), markup) + def test_nested_namespaces(self): + doc = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> +<parent xmlns="http://ns1/"> +<child xmlns="http://ns2/" xmlns:ns3="http://ns3/"> +<grandchild ns3:attr="value" xmlns="http://ns4/"/> +</child> +</parent>""" + soup = self.soup(doc) + self.assertEqual(doc, soup.encode()) + def test_formatter_processes_script_tag_for_xml_documents(self): doc = """ <script type="text/javascript"> @@ -637,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object): self.assertTrue(b"< < hey > >" in encoded) def test_can_parse_unicode_document(self): - markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' + markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' soup = self.soup(markup) - self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) + self.assertEqual('Sacr\xe9 bleu!', soup.root.string) def test_popping_namespaced_tag(self): markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' soup = self.soup(markup) self.assertEqual( - unicode(soup.rss), markup) + str(soup.rss), markup) def test_docstring_includes_correct_encoding(self): soup = self.soup("<root/>") @@ -676,17 +892,17 @@ class XMLTreeBuilderSmokeTest(object): def test_closing_namespaced_tag(self): markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>' soup = self.soup(markup) - self.assertEqual(unicode(soup.p), markup) + self.assertEqual(str(soup.p), markup) def test_namespaced_attributes(self): markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) + self.assertEqual(str(soup.foo), markup) def test_namespaced_attributes_xml_namespace(self): markup = '<foo xml:lang="fr">bar</foo>' soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) + self.assertEqual(str(soup.foo), markup) def test_find_by_prefixed_name(self): doc = """<?xml version="1.0" encoding="utf-8"?> @@ -721,6 +937,12 @@ class XMLTreeBuilderSmokeTest(object): # The two tags have the same namespace prefix. 
self.assertEqual(tag.prefix, duplicate.prefix) + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" diff --git a/libs/bs4/tests/test_html5lib.py b/libs/bs4/tests/test_html5lib.py index 0f89d6244..96529b0b3 100644 --- a/libs/bs4/tests/test_html5lib.py +++ b/libs/bs4/tests/test_html5lib.py @@ -5,7 +5,7 @@ import warnings try: from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True -except ImportError, e: +except ImportError as e: HTML5LIB_PRESENT = False from bs4.element import SoupStrainer from bs4.testing import ( @@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): @property def default_builder(self): - return HTML5TreeBuilder() + return HTML5TreeBuilder def test_soupstrainer(self): # The html5lib tree builder does not support SoupStrainers. @@ -74,14 +74,14 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): def test_reparented_markup(self): markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>' soup = self.soup(markup) - self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) + self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) def test_reparented_markup_ends_with_whitespace(self): markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n' soup = self.soup(markup) - self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) + self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) def test_reparented_markup_containing_identical_whitespace_nodes(self): @@ -127,4 +127,44 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): def test_foster_parenting(self): markup = b"""<table><td></tbody>A""" soup = self.soup(markup) - self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode()) + self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode()) + + def test_extraction(self): + """ + Test that extraction does not destroy the tree. + + https://bugs.launchpad.net/beautifulsoup/+bug/1782928 + """ + + markup = """ +<html><head></head> +<style> +</style><script></script><body><p>hello</p></body></html> +""" + soup = self.soup(markup) + [s.extract() for s in soup('script')] + [s.extract() for s in soup('style')] + + self.assertEqual(len(soup.find_all("p")), 1) + + def test_empty_comment(self): + """ + Test that empty comment does not break structure. 
+ + https://bugs.launchpad.net/beautifulsoup/+bug/1806598 + """ + + markup = """ +<html> +<body> +<form> +<!----><input type="text"> +</form> +</body> +</html> +""" + soup = self.soup(markup) + inputs = [] + for form in soup.find_all('form'): + inputs.extend(form.find_all('input')) + self.assertEqual(len(inputs), 1) diff --git a/libs/bs4/tests/test_htmlparser.py b/libs/bs4/tests/test_htmlparser.py index d5cf0253f..790489aa1 100644 --- a/libs/bs4/tests/test_htmlparser.py +++ b/libs/bs4/tests/test_htmlparser.py @@ -5,12 +5,11 @@ from pdb import set_trace import pickle from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from bs4.builder import HTMLParserTreeBuilder +from bs4.builder._htmlparser import BeautifulSoupHTMLParser class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): - @property - def default_builder(self): - return HTMLParserTreeBuilder() + default_builder = HTMLParserTreeBuilder def test_namespaced_system_doctype(self): # html.parser can't handle namespaced doctypes, so skip this one. @@ -32,3 +31,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): def test_redundant_empty_element_closing_tags(self): self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>") self.assertSoupEquals('</br></br></br>', "") + + def test_empty_element(self): + # This verifies that any buffered data present when the parser + # finishes working is handled. + self.assertSoupEquals("foo &# bar", "foo &# bar") + + +class TestHTMLParserSubclass(SoupTest): + def test_error(self): + """Verify that our HTMLParser subclass implements error() in a way + that doesn't cause a crash. + """ + parser = BeautifulSoupHTMLParser() + parser.error("don't crash") diff --git a/libs/bs4/tests/test_lxml.py b/libs/bs4/tests/test_lxml.py index a05870b91..29da71149 100644 --- a/libs/bs4/tests/test_lxml.py +++ b/libs/bs4/tests/test_lxml.py @@ -7,7 +7,7 @@ try: import lxml.etree LXML_PRESENT = True LXML_VERSION = lxml.etree.LXML_VERSION -except ImportError, e: +except ImportError as e: LXML_PRESENT = False LXML_VERSION = (0,) @@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): @property def default_builder(self): - return LXMLTreeBuilder() + return LXMLTreeBuilder def test_out_of_range_entity(self): self.assertSoupEquals( @@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "<p>foo�bar</p>", "<p>foobar</p>") + def test_entities_in_foreign_document_encoding(self): + # We can't implement this case correctly because by the time we + # hear about markup like "“", it's been (incorrectly) converted into + # a string like u'\x93' + pass + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # test if an old version of lxml is installed. @@ -62,7 +68,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): # if one is installed. 
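# Editor's sketch: across these test diffs, default_builder now holds the
# TreeBuilder *class* rather than a pre-built instance; BeautifulSoup
# instantiates it and forwards extra keyword arguments to its constructor.
# The equivalent direct use:
from bs4 import BeautifulSoup
from bs4.builder import HTMLParserTreeBuilder
soup = BeautifulSoup("<p>hi</p>", builder=HTMLParserTreeBuilder)
print(soup.p.string)  # hi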
with warnings.catch_warnings(record=True) as w: soup = BeautifulStoneSoup("<b />") - self.assertEqual(u"<b/>", unicode(soup.b)) + self.assertEqual("<b/>", str(soup.b)) self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) @skipIf( @@ -73,4 +79,22 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): @property def default_builder(self): - return LXMLTreeBuilderForXML() + return LXMLTreeBuilderForXML + + def test_namespace_indexing(self): + # We should not track un-prefixed namespaces as we can only hold one + # and it will be recognized as the default namespace by soupsieve, + # which may be confusing in some situations. When no namespace is provided + # for a selector, the default namespace (if defined) is assumed. + + soup = self.soup( + '<?xml version="1.1"?>\n' + '<root>' + '<tag xmlns="http://unprefixed-namespace.com">content</tag>' + '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>' + '</root>' + ) + self.assertEqual( + soup._namespaces, + {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'} + ) diff --git a/libs/bs4/tests/test_soup.py b/libs/bs4/tests/test_soup.py index f3e69edf3..1eda9484b 100644 --- a/libs/bs4/tests/test_soup.py +++ b/libs/bs4/tests/test_soup.py @@ -24,6 +24,7 @@ from bs4.dammit import ( EncodingDetector, ) from bs4.testing import ( + default_builder, SoupTest, skipIf, ) @@ -32,7 +33,7 @@ import warnings try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True -except ImportError, e: +except ImportError as e: LXML_PRESENT = False PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) @@ -40,21 +41,86 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) class TestConstructor(SoupTest): def test_short_unicode_input(self): - data = u"<h1>éé</h1>" + data = "<h1>éé</h1>" soup = self.soup(data) - self.assertEqual(u"éé", soup.h1.string) + self.assertEqual("éé", soup.h1.string) def test_embedded_null(self): - data = u"<h1>foo\0bar</h1>" + data = "<h1>foo\0bar</h1>" soup = self.soup(data) - self.assertEqual(u"foo\0bar", soup.h1.string) + self.assertEqual("foo\0bar", soup.h1.string) def test_exclude_encodings(self): - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) self.assertEqual("windows-1252", soup.original_encoding) - + def test_custom_builder_class(self): + # Verify that you can pass in a custom Builder class and + # it'll be instantiated with the appropriate keyword arguments. + class Mock(object): + def __init__(self, **kwargs): + self.called_with = kwargs + self.is_xml = True + def initialize_soup(self, soup): + pass + def prepare_markup(self, *args, **kwargs): + return '' + + kwargs = dict( + var="value", + # This is a deprecated BS3-era keyword argument, which + # will be stripped out. + convertEntities=True, + ) + with warnings.catch_warnings(record=True): + soup = BeautifulSoup('', builder=Mock, **kwargs) + assert isinstance(soup.builder, Mock) + self.assertEqual(dict(var="value"), soup.builder.called_with) + + # You can also instantiate the TreeBuilder yourself. In this + # case, that specific object is used and any keyword arguments + # to the BeautifulSoup constructor are ignored. 
+ builder = Mock(**kwargs) + with warnings.catch_warnings(record=True) as w: + soup = BeautifulSoup( + '', builder=builder, ignored_value=True, + ) + msg = str(w[0].message) + assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.") + self.assertEqual(builder, soup.builder) + self.assertEqual(kwargs, builder.called_with) + + def test_cdata_list_attributes(self): + # Most attribute values are represented as scalars, but the + # HTML standard says that some attributes, like 'class' have + # space-separated lists as values. + markup = '<a id=" an id " class=" a class "></a>' + soup = self.soup(markup) + + # Note that the spaces are stripped for 'class' but not for 'id'. + a = soup.a + self.assertEqual(" an id ", a['id']) + self.assertEqual(["a", "class"], a['class']) + + # TreeBuilder takes an argument called 'mutli_valued_attributes' which lets + # you customize or disable this. As always, you can customize the TreeBuilder + # by passing in a keyword argument to the BeautifulSoup constructor. + soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None) + self.assertEqual(" a class ", soup.a['class']) + + # Here are two ways of saying that `id` is a multi-valued + # attribute in this context, but 'class' is not. + for switcheroo in ({'*': 'id'}, {'a': 'id'}): + with warnings.catch_warnings(record=True) as w: + # This will create a warning about not explicitly + # specifying a parser, but we'll ignore it. + soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo) + a = soup.a + self.assertEqual(["an", "id"], a['id']) + self.assertEqual(" a class ", a['class']) + + class TestWarnings(SoupTest): def _no_parser_specified(self, s, is_there=True): @@ -129,7 +195,7 @@ class TestWarnings(SoupTest): with warnings.catch_warnings(record=True) as warning_list: # note - this url must differ from the bytes one otherwise # python's warnings system swallows the second warning - soup = self.soup(u"http://www.crummyunicode.com/") + soup = self.soup("http://www.crummyunicode.com/") self.assertTrue(any("looks like a URL" in str(w.message) for w in warning_list)) @@ -141,7 +207,7 @@ class TestWarnings(SoupTest): def test_url_warning_with_unicode_and_space(self): with warnings.catch_warnings(record=True) as warning_list: - soup = self.soup(u"http://www.crummyuncode.com/ is great") + soup = self.soup("http://www.crummyuncode.com/ is great") self.assertFalse(any("looks like a URL" in str(w.message) for w in warning_list)) @@ -163,9 +229,9 @@ class TestEntitySubstitution(unittest.TestCase): def test_simple_html_substitution(self): # Unicode characters corresponding to named HTML entites # are substituted, and no others. 
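# Editor's sketch of the multi_valued_attributes knob exercised in
# test_cdata_list_attributes above (values mirror that test):
from bs4 import BeautifulSoup
markup = '<a id=" an id " class=" a class "></a>'
print(BeautifulSoup(markup, "html.parser").a["class"])  # ['a', 'class']
soup = BeautifulSoup(markup, "html.parser", multi_valued_attributes=None)
print(soup.a["class"])  # ' a class ' -- whitespace splitting disabled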
- s = u"foo\u2200\N{SNOWMAN}\u00f5bar" + s = "foo\u2200\N{SNOWMAN}\u00f5bar" self.assertEqual(self.sub.substitute_html(s), - u"foo∀\N{SNOWMAN}õbar") + "foo∀\N{SNOWMAN}õbar") def test_smart_quote_substitution(self): # MS smart quotes are a common source of frustration, so we @@ -217,7 +283,7 @@ class TestEntitySubstitution(unittest.TestCase): self.assertEqual( self.sub.substitute_xml_containing_entities("ÁT&T"), "ÁT&T") - + def test_quotes_not_html_substituted(self): """There's no need to do this except inside attribute values.""" text = 'Bob\'s "bar"' @@ -230,7 +296,7 @@ class TestEncodingConversion(SoupTest): def setUp(self): super(TestEncodingConversion, self).setUp() - self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' + self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' self.utf8_data = self.unicode_data.encode("utf-8") # Just so you know what it looks like. self.assertEqual( @@ -250,7 +316,7 @@ class TestEncodingConversion(SoupTest): ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) + self.assertTrue(isinstance(unicode_output, str)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: @@ -262,7 +328,7 @@ class TestEncodingConversion(SoupTest): # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) - self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): @@ -270,7 +336,7 @@ class TestEncodingConversion(SoupTest): # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) - self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') def test_utf8_out(self): # The internal data structures can be encoded as UTF-8. @@ -281,14 +347,14 @@ class TestEncodingConversion(SoupTest): PYTHON_3_PRE_3_2, "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") def test_attribute_name_containing_unicode_characters(self): - markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' + markup = '<div><a \N{SNOWMAN}="snowman"></a></div>' self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) class TestUnicodeDammit(unittest.TestCase): """Standalone tests of UnicodeDammit.""" def test_unicode_input(self): - markup = u"I'm already Unicode! \N{SNOWMAN}" + markup = "I'm already Unicode! \N{SNOWMAN}" dammit = UnicodeDammit(markup) self.assertEqual(dammit.unicode_markup, markup) @@ -296,7 +362,7 @@ class TestUnicodeDammit(unittest.TestCase): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEqual( - dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") + dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>") def test_smart_quotes_to_xml_entities(self): markup = b"<foo>\x91\x92\x93\x94</foo>" @@ -320,14 +386,14 @@ class TestUnicodeDammit(unittest.TestCase): utf8 = b"Sacr\xc3\xa9 bleu! 
\xe2\x98\x83" dammit = UnicodeDammit(utf8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}') + self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}') def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') - self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" @@ -336,19 +402,19 @@ class TestUnicodeDammit(unittest.TestCase): self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_ignore_invalid_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_exclude_encodings(self): # This is UTF-8. - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") # But if we exclude UTF-8 from consideration, the guess is # Windows-1252. @@ -364,7 +430,7 @@ class TestUnicodeDammit(unittest.TestCase): detected = EncodingDetector( b'<?xml version="1.0" encoding="UTF-\xdb" ?>') encodings = list(detected.encodings) - assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings + assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings def test_detect_html5_style_meta_tag(self): @@ -404,7 +470,7 @@ class TestUnicodeDammit(unittest.TestCase): bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) - self.assertTrue(u"\ufffd" in dammit.unicode_markup) + self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) @@ -416,17 +482,17 @@ class TestUnicodeDammit(unittest.TestCase): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) - self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) + self.assertEqual("<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding) def test_detwingle(self): # Here's a UTF8 document. - utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") + utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") # Here's a Windows-1252 document. windows_1252 = ( - u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" - u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") + "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" + "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") # Through some unholy alchemy, they've been stuck together. 
doc = utf8 + windows_1252 + utf8 @@ -441,7 +507,7 @@ class TestUnicodeDammit(unittest.TestCase): fixed = UnicodeDammit.detwingle(doc) self.assertEqual( - u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) + "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) def test_detwingle_ignores_multibyte_characters(self): # Each of these characters has a UTF-8 representation ending @@ -449,9 +515,9 @@ class TestUnicodeDammit(unittest.TestCase): # Windows-1252. But our code knows to skip over multibyte # UTF-8 characters, so they'll survive the process unscathed. for tricky_unicode_char in ( - u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' - u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' - u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. + "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' + "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' + "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. ): input = tricky_unicode_char.encode("utf8") self.assertTrue(input.endswith(b'\x93')) diff --git a/libs/bs4/tests/test_tree.py b/libs/bs4/tests/test_tree.py index c0e7c4080..3b4beeb8f 100644 --- a/libs/bs4/tests/test_tree.py +++ b/libs/bs4/tests/test_tree.py @@ -1,4 +1,3 @@ - # -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. @@ -26,6 +25,7 @@ from bs4.element import ( Comment, Declaration, Doctype, + Formatter, NavigableString, SoupStrainer, Tag, @@ -71,13 +71,13 @@ class TestFind(TreeTest): self.assertEqual(soup.find("b").string, "2") def test_unicode_text_find(self): - soup = self.soup(u'<h1>Räksmörgås</h1>') - self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås') + soup = self.soup('<h1>Räksmörgås</h1>') + self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås') def test_unicode_attribute_find(self): - soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>') + soup = self.soup('<h1 id="Räksmörgås">here it is</h1>') str(soup) - self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text) + self.assertEqual("here it is", soup.find(id='Räksmörgås').text) def test_find_everything(self): @@ -97,17 +97,17 @@ class TestFindAll(TreeTest): """You can search the tree for text nodes.""" soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") # Exact match. - self.assertEqual(soup.find_all(string="bar"), [u"bar"]) - self.assertEqual(soup.find_all(text="bar"), [u"bar"]) + self.assertEqual(soup.find_all(string="bar"), ["bar"]) + self.assertEqual(soup.find_all(text="bar"), ["bar"]) # Match any of a number of strings. self.assertEqual( - soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) + soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) # Match a regular expression. self.assertEqual(soup.find_all(text=re.compile('.*')), - [u"Foo", u"bar", u'\xbb']) + ["Foo", "bar", '\xbb']) # Match anything. 
 self.assertEqual(soup.find_all(text=True),
- [u"Foo", u"bar", u'\xbb'])
+ ["Foo", "bar", '\xbb'])
 def test_find_all_limit(self):
 """You can limit the number of items returned by find_all."""
@@ -250,8 +250,8 @@ class TestFindAllByAttribute(TreeTest):
 ["Matching a.", "Matching b."])
 def test_find_all_by_utf8_attribute_value(self):
- peace = u"םולש".encode("utf8")
- data = u'<a title="םולש"></a>'.encode("utf8")
+ peace = "םולש".encode("utf8")
+ data = '<a title="םולש"></a>'.encode("utf8")
 soup = self.soup(data)
 self.assertEqual([soup.a], soup.find_all(title=peace))
 self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@@ -417,6 +417,48 @@ class TestFindAllByAttribute(TreeTest):
 self.assertEqual([], soup.find_all(id=1, text="bar"))
+
+class TestSmooth(TreeTest):
+ """Test Tag.smooth."""
+
+ def test_smooth(self):
+ soup = self.soup("<div>a</div>")
+ div = soup.div
+ div.append("b")
+ div.append("c")
+ div.append(Comment("Comment 1"))
+ div.append(Comment("Comment 2"))
+ div.append("d")
+ builder = self.default_builder()
+ span = Tag(soup, builder, 'span')
+ span.append('1')
+ span.append('2')
+ div.append(span)
+
+ # At this point the tree has a bunch of adjacent
+ # NavigableStrings. This is normal, but it has no meaning in
+ # terms of HTML, so we may want to smooth things out for
+ # output.
+
+ # Since the <span> tag has two children, its .string is None.
+ self.assertEqual(None, div.span.string)
+
+ self.assertEqual(7, len(div.contents))
+ div.smooth()
+ self.assertEqual(5, len(div.contents))
+
+ # The three strings at the beginning of div.contents have been
+ # merged into one string.
+ #
+ self.assertEqual('abc', div.contents[0])
+
+ # The call is recursive -- the <span> tag was also smoothed.
+ self.assertEqual('12', div.span.string)
+
+ # The two comments have _not_ been merged, even though
+ # comments are strings. Merging comments would change the
+ # meaning of the HTML.
+ self.assertEqual('Comment 1', div.contents[1])
+ self.assertEqual('Comment 2', div.contents[2])
 class TestIndex(TreeTest):
@@ -605,7 +647,7 @@ class SiblingTest(TreeTest):
 </html>'''
 # All that whitespace looks good but makes the tests more
 # difficult. Get rid of it.
- markup = re.compile("\n\s*").sub("", markup)
+ markup = re.compile(r"\n\s*").sub("", markup)
 self.tree = self.soup(markup)
@@ -703,12 +745,12 @@ class TestTagCreation(SoupTest):
 """Test the ability to create new tags."""
 def test_new_tag(self):
 soup = self.soup("")
- new_tag = soup.new_tag("foo", bar="baz")
+ new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
 self.assertTrue(isinstance(new_tag, Tag))
 self.assertEqual("foo", new_tag.name)
- self.assertEqual(dict(bar="baz"), new_tag.attrs)
+ self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
 self.assertEqual(None, new_tag.parent)
-
+
 def test_tag_inherits_self_closing_rules_from_builder(self):
 if XML_BUILDER_PRESENT:
 xml_soup = BeautifulSoup("", "lxml-xml")
@@ -821,6 +863,26 @@ class TestTreeModification(SoupTest):
 soup = self.soup(text)
 self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
+ def test_insert_beautifulsoup_object_inserts_children(self):
+ """Inserting one BeautifulSoup object into another actually inserts all
+ of its children -- you'll never combine BeautifulSoup objects.
+ """ + soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>") + + text = "<p>p2</p><p>p3</p>" + to_insert = self.soup(text) + soup.insert(1, to_insert) + + for i in soup.descendants: + assert not isinstance(i, BeautifulSoup) + + p1, p2, p3, p4 = list(soup.children) + self.assertEqual("And now, a word:", p1.string) + self.assertEqual("p2", p2.string) + self.assertEqual("p3", p3.string) + self.assertEqual("And we're back.", p4.string) + + def test_replace_with_maintains_next_element_throughout(self): soup = self.soup('<p><a>one</a><b>three</b></p>') a = soup.a @@ -877,7 +939,7 @@ class TestTreeModification(SoupTest): self.assertEqual(soup.a.contents[0].next_element, "bar") def test_insert_tag(self): - builder = self.default_builder + builder = self.default_builder() soup = self.soup( "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder) magic_tag = Tag(soup, builder, 'magictag') @@ -912,6 +974,13 @@ class TestTreeModification(SoupTest): soup.a.append(soup.b) self.assertEqual(data, soup.decode()) + def test_extend(self): + data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>" + soup = self.soup(data) + l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b] + soup.a.extend(l) + self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode()) + def test_move_tag_to_beginning_of_parent(self): data = "<a><b></b><c></c><d></d></a>" soup = self.soup(data) @@ -938,6 +1007,29 @@ class TestTreeModification(SoupTest): self.assertEqual( soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) + # Can't insert an element before itself. + b = soup.b + self.assertRaises(ValueError, b.insert_before, b) + + # Can't insert before if an element has no parent. + b.extract() + self.assertRaises(ValueError, b.insert_before, "nope") + + # Can insert an identical element + soup = self.soup("<a>") + soup.a.insert_before(soup.new_tag("a")) + + def test_insert_multiple_before(self): + soup = self.soup("<a>foo</a><b>bar</b>") + soup.b.insert_before("BAZ", " ", "QUUX") + soup.a.insert_before("QUUX", " ", "BAZ") + self.assertEqual( + soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>")) + + soup.a.insert_before(soup.b, "FOO") + self.assertEqual( + soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX")) + def test_insert_after(self): soup = self.soup("<a>foo</a><b>bar</b>") soup.b.insert_after("BAZ") @@ -948,6 +1040,28 @@ class TestTreeModification(SoupTest): self.assertEqual( soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) + # Can't insert an element after itself. + b = soup.b + self.assertRaises(ValueError, b.insert_after, b) + + # Can't insert after if an element has no parent. 
+ b.extract() + self.assertRaises(ValueError, b.insert_after, "nope") + + # Can insert an identical element + soup = self.soup("<a>") + soup.a.insert_before(soup.new_tag("a")) + + def test_insert_multiple_after(self): + soup = self.soup("<a>foo</a><b>bar</b>") + soup.b.insert_after("BAZ", " ", "QUUX") + soup.a.insert_after("QUUX", " ", "BAZ") + self.assertEqual( + soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX")) + soup.b.insert_after(soup.a, "FOO ") + self.assertEqual( + soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX")) + def test_insert_after_raises_exception_if_after_has_no_meaning(self): soup = self.soup("") tag = soup.new_tag("a") @@ -1111,7 +1225,7 @@ class TestTreeModification(SoupTest): <script>baz</script> </html>""") [soup.script.extract() for i in soup.find_all("script")] - self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body)) + self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body)) def test_extract_works_when_element_is_surrounded_by_identical_strings(self): @@ -1186,7 +1300,7 @@ class TestElementObjects(SoupTest): tag = soup.bTag self.assertEqual(soup.b, tag) self.assertEqual( - '.bTag is deprecated, use .find("b") instead.', + '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")', str(w[0].message)) def test_has_attr(self): @@ -1349,19 +1463,19 @@ class TestPersistence(SoupTest): soup = BeautifulSoup(b'<p> </p>', 'html.parser') encoding = soup.original_encoding copy = soup.__copy__() - self.assertEqual(u"<p> </p>", unicode(copy)) + self.assertEqual("<p> </p>", str(copy)) self.assertEqual(encoding, copy.original_encoding) def test_unicode_pickle(self): # A tree containing Unicode characters can be pickled. - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) loaded = pickle.loads(dumped) self.assertEqual(loaded.decode(), soup.decode()) def test_copy_navigablestring_is_not_attached_to_tree(self): - html = u"<b>Foo<a></a></b><b>Bar</b>" + html = "<b>Foo<a></a></b><b>Bar</b>" soup = self.soup(html) s1 = soup.find(string="Foo") s2 = copy.copy(s1) @@ -1373,7 +1487,7 @@ class TestPersistence(SoupTest): self.assertEqual(None, s2.previous_element) def test_copy_navigablestring_subclass_has_same_type(self): - html = u"<b><!--Foo--></b>" + html = "<b><!--Foo--></b>" soup = self.soup(html) s1 = soup.string s2 = copy.copy(s1) @@ -1381,19 +1495,19 @@ class TestPersistence(SoupTest): self.assertTrue(isinstance(s2, Comment)) def test_copy_entire_soup(self): - html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end" + html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" soup = self.soup(html) soup_copy = copy.copy(soup) self.assertEqual(soup, soup_copy) def test_copy_tag_copies_contents(self): - html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end" + html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" soup = self.soup(html) div = soup.div div_copy = copy.copy(div) # The two tags look the same, and evaluate to equal. - self.assertEqual(unicode(div), unicode(div_copy)) + self.assertEqual(str(div), str(div_copy)) self.assertEqual(div, div_copy) # But they're not the same object. 
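The test_cdata_list_attributes hunk above exercises the multi_valued_attributes argument that upstream Beautiful Soup 4.8 adds to the constructor (it is forwarded to the TreeBuilder). A minimal sketch of the behavior the test pins down, assuming the vendored bs4 under Python 3:

    from bs4 import BeautifulSoup

    markup = '<a id=" an id " class=" a class "></a>'

    # By default 'class' is parsed as a space-separated list, 'id' as a scalar.
    soup = BeautifulSoup(markup, 'html.parser')
    print(soup.a['class'])  # ['a', 'class']
    print(soup.a['id'])     # ' an id '

    # multi_valued_attributes=None disables list-valued attributes entirely.
    soup = BeautifulSoup(markup, 'html.parser', multi_valued_attributes=None)
    print(soup.a['class'])  # ' a class '

    # A custom map can make 'id' (and only 'id') multi-valued on every tag.
    soup = BeautifulSoup(markup, 'html.parser', multi_valued_attributes={'*': 'id'})
    print(soup.a['id'])     # ['an', 'id']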
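The UnicodeDammit hunks are mostly mechanical u''-prefix removals, but the two behaviors they keep pinned down are easy to miss in the noise. A sketch, using only calls the tests themselves make:

    from bs4 import UnicodeDammit

    # Candidate encodings are tried in order; invalid or inappropriate
    # ones are skipped, and detection falls through to UTF-8.
    dammit = UnicodeDammit("Räksmörgås".encode("utf-8"), ["iso-8859-8"])
    print(dammit.original_encoding)  # 'utf-8'

    # detwingle() repairs Windows-1252 bytes embedded in a UTF-8 document.
    utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
    windows_1252 = ("\N{LEFT DOUBLE QUOTATION MARK}Hi!"
                    "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows-1252")
    fixed = UnicodeDammit.detwingle(utf8 + windows_1252 + utf8)
    print(fixed.decode("utf8"))      # '☃☃☃“Hi!”☃☃☃'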
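TestSmooth covers Tag.smooth(), new in upstream 4.8: it recursively merges runs of adjacent NavigableStrings while leaving Comments (and other NavigableString subclasses) untouched, since merging those would change the meaning of the markup. Roughly:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div>a</div>", "html.parser")
    soup.div.append("b")
    soup.div.append("c")
    print(len(soup.div.contents))  # 3 -- three adjacent NavigableStrings
    print(soup.div.string)         # None, because the <div> has several children

    soup.smooth()                  # merges in place, recursively
    print(len(soup.div.contents))  # 1
    print(soup.div.string)         # 'abc'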
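The tree-modification hunks lean on two more additions from the same upstream release: insert_before()/insert_after() now accept any number of elements, and Tag.extend() appends (or moves) a list of elements in order. For example:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>foo</a><b>bar</b>", "html.parser")
    soup.b.insert_before("BAZ", " ", "QUUX")  # several nodes in one call
    print(soup.decode())  # <a>foo</a>BAZ QUUX<b>bar</b>

    soup = BeautifulSoup("<a><b></b><c></c></a>", "html.parser")
    soup.a.extend([soup.c, soup.b])           # moves the existing tags, in order
    print(soup.decode())  # <a><c></c><b></b></a>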
@@ -1409,67 +1523,75 @@ class TestPersistence(SoupTest):
 class TestSubstitutions(SoupTest):
 def test_default_formatter_is_minimal(self):
- markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+ markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
 soup = self.soup(markup)
 decoded = soup.decode(formatter="minimal")
 # The < is converted back into &lt; but the e-with-acute is left alone.
 self.assertEqual(
 decoded,
 self.document_for(
- u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+ "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
 def test_formatter_html(self):
- markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+ markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
 soup = self.soup(markup)
 decoded = soup.decode(formatter="html")
 self.assertEqual(
 decoded,
- self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+ self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+ def test_formatter_html5(self):
+ markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+ soup = self.soup(markup)
+ decoded = soup.decode(formatter="html5")
+ self.assertEqual(
+ decoded,
+ self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+
 def test_formatter_minimal(self):
- markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+ markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
 soup = self.soup(markup)
 decoded = soup.decode(formatter="minimal")
 # The < is converted back into &lt; but the e-with-acute is left alone.
 self.assertEqual(
 decoded,
 self.document_for(
- u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+ "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
 def test_formatter_null(self):
- markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+ markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
 soup = self.soup(markup)
 decoded = soup.decode(formatter=None)
 # Neither the angle brackets nor the e-with-acute are converted.
 # This is not valid HTML, but it's what the user wanted.
 self.assertEqual(decoded,
- self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
+ self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
 def test_formatter_custom(self):
- markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
+ markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
 soup = self.soup(markup)
 decoded = soup.decode(formatter = lambda x: x.upper())
 # Instead of normal entity conversion code, the custom
 # callable is called on every string.
 self.assertEqual(
 decoded,
- self.document_for(u"<b><FOO></b><b>BAR</b>"))
+ self.document_for("<b><FOO></b><b>BAR</b><br/>"))
 def test_formatter_is_run_on_attribute_values(self):
- markup = u'<a href="http://a.com?a=b&c=é">e</a>'
+ markup = '<a href="http://a.com?a=b&c=é">e</a>'
 soup = self.soup(markup)
 a = soup.a
- expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
+ expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
 self.assertEqual(expect_minimal, a.decode())
 self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
- expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+ expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
 self.assertEqual(expect_html, a.decode(formatter="html"))
 self.assertEqual(markup, a.decode(formatter=None))
- expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
+ expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
 def test_formatter_skips_script_tag_for_html_documents(self):
@@ -1491,28 +1613,28 @@ class TestSubstitutions(SoupTest):
 self.assertTrue(b"< < hey > >" in encoded)
 def test_prettify_leaves_preformatted_text_alone(self):
- soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
+ soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
 # Everything outside the <pre> tag is reformatted, but everything
 # inside is left alone.
 self.assertEqual(
- u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
+ '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
 soup.div.prettify())
- def test_prettify_accepts_formatter(self):
+ def test_prettify_accepts_formatter_function(self):
 soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
 pretty = soup.prettify(formatter = lambda x: x.upper())
 self.assertTrue("FOO" in pretty)
 def test_prettify_outputs_unicode_by_default(self):
 soup = self.soup("<a></a>")
- self.assertEqual(unicode, type(soup.prettify()))
+ self.assertEqual(str, type(soup.prettify()))
 def test_prettify_can_encode_data(self):
 soup = self.soup("<a></a>")
 self.assertEqual(bytes, type(soup.prettify("utf-8")))
 def test_html_entity_substitution_off_by_default(self):
- markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+ markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
 soup = self.soup(markup)
 encoded = soup.b.encode("utf-8")
 self.assertEqual(encoded, markup.encode('utf-8'))
@@ -1556,54 +1678,77 @@ class TestEncoding(SoupTest):
 """Test the ability to encode objects into strings."""
 def test_unicode_string_can_be_encoded(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
 soup = self.soup(html)
 self.assertEqual(soup.b.string.encode("utf-8"),
- u"\N{SNOWMAN}".encode("utf-8"))
+ "\N{SNOWMAN}".encode("utf-8"))
 def test_tag_containing_unicode_string_can_be_encoded(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
 soup = self.soup(html)
 self.assertEqual(
 soup.b.encode("utf-8"), html.encode("utf-8"))
 def test_encoding_substitutes_unrecognized_characters_by_default(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
 soup = self.soup(html)
 self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
 def test_encoding_can_be_made_strict(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
 soup = self.soup(html)
 self.assertRaises(
 UnicodeEncodeError, soup.encode, "ascii", errors="strict")
 def test_decode_contents(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
 soup = self.soup(html)
- self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())
+ self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
 def test_encode_contents(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
 soup = self.soup(html)
 self.assertEqual(
- u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
+ "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
 encoding="utf8"))
 def test_deprecated_renderContents(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
 soup = self.soup(html)
 self.assertEqual(
- u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
+ "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
 def test_repr(self):
- html = u"<b>\N{SNOWMAN}</b>"
+ html = "<b>\N{SNOWMAN}</b>"
 soup = self.soup(html)
 if PY3K:
 self.assertEqual(html, repr(soup))
 else:
 self.assertEqual(b'<b>\\u2603</b>', repr(soup))
+class TestFormatter(SoupTest):
+
+ def test_sort_attributes(self):
+ # Test the ability to override Formatter.attributes() to,
+ # e.g., disable the normal sorting of attributes.
+ class UnsortedFormatter(Formatter):
+ def attributes(self, tag):
+ self.called_with = tag
+ for k, v in sorted(tag.attrs.items()):
+ if k == 'ignore':
+ continue
+ yield k,v
+
+ soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
+ formatter = UnsortedFormatter()
+ decoded = soup.decode(formatter=formatter)
+
+ # attributes() was called on the <p> tag. It filtered out one
+ # attribute and sorted the other two.
+ self.assertEqual(formatter.called_with, soup.p)
+ self.assertEqual('<p aval="2" cval="1"></p>', decoded)
+
+
 class TestNavigableStringSubclasses(SoupTest):
 def test_cdata(self):
@@ -1720,7 +1865,7 @@ class TestSoupSelector(TreeTest):
 els = self.soup.select('title')
 self.assertEqual(len(els), 1)
 self.assertEqual(els[0].name, 'title')
- self.assertEqual(els[0].contents, [u'The title'])
+ self.assertEqual(els[0].contents, ['The title'])
 def test_one_tag_many(self):
 els = self.soup.select('div')
@@ -1755,7 +1900,7 @@ class TestSoupSelector(TreeTest):
 self.assertEqual(len(self.soup.select('del')), 0)
 def test_invalid_tag(self):
- self.assertRaises(ValueError, self.soup.select, 'tag%t')
+ self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
 def test_select_dashed_tag_ids(self):
 self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@@ -1766,7 +1911,7 @@ class TestSoupSelector(TreeTest):
 self.assertEqual(dashed[0]['id'], 'dash2')
 def test_dashed_tag_text(self):
- self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.')
+ self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
 def test_select_dashed_matches_find_all(self):
 self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
@@ -1946,32 +2091,31 @@ class TestSoupSelector(TreeTest):
 NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
 self.assertRaises(
- NotImplementedError, self.soup.select, "a:nth-of-type(a)")
-
+ SyntaxError, self.soup.select, "a:nth-of-type(a)")
 def test_nth_of_type(self):
 # Try to select first paragraph
 els = self.soup.select('div#inner p:nth-of-type(1)')
 self.assertEqual(len(els), 1)
- self.assertEqual(els[0].string, u'Some text')
+ self.assertEqual(els[0].string, 'Some text')
 # Try to select third paragraph
 els = self.soup.select('div#inner p:nth-of-type(3)')
 self.assertEqual(len(els), 1)
- self.assertEqual(els[0].string, u'Another')
+ self.assertEqual(els[0].string, 'Another')
 # Try to select (non-existent!) fourth paragraph
 els = self.soup.select('div#inner p:nth-of-type(4)')
 self.assertEqual(len(els), 0)
- # Pass in an invalid value.
- self.assertRaises(
- ValueError, self.soup.select, 'div p:nth-of-type(0)')
+ # Zero will select no tags.
+ els = self.soup.select('div p:nth-of-type(0)')
+ self.assertEqual(len(els), 0)
 def test_nth_of_type_direct_descendant(self):
 els = self.soup.select('div#inner > p:nth-of-type(1)')
 self.assertEqual(len(els), 1)
- self.assertEqual(els[0].string, u'Some text')
+ self.assertEqual(els[0].string, 'Some text')
 def test_id_child_selector_nth_of_type(self):
 self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
@@ -2003,7 +2147,7 @@ class TestSoupSelector(TreeTest):
 self.assertEqual([], self.soup.select('#inner ~ h2'))
 def test_dangling_combinator(self):
- self.assertRaises(ValueError, self.soup.select, 'h1 >')
+ self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
 def test_sibling_combinator_wont_select_same_tag_twice(self):
 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@@ -2034,8 +2178,8 @@ class TestSoupSelector(TreeTest):
 self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
 def test_invalid_multiple_select(self):
- self.assertRaises(ValueError, self.soup.select, ',x, y')
- self.assertRaises(ValueError, self.soup.select, 'x,,y')
+ self.assertRaises(SyntaxError, self.soup.select, ',x, y')
+ self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
 def test_multiple_select_attrs(self):
 self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
@@ -2046,5 +2190,16 @@ class TestSoupSelector(TreeTest):
 def test_multiple_select_nested(self):
 self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
-
-
+ def test_select_duplicate_elements(self):
+ # When markup contains duplicate elements, a multiple select
+ # will find all of them.
+ markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
+ soup = BeautifulSoup(markup, 'html.parser')
+ selected = soup.select(".c1, .c2")
+ self.assertEqual(3, len(selected))
+
+ # Verify that find_all finds the same elements, though because
+ # of an implementation detail it finds them in a different
+ # order.
+ for element in soup.find_all(class_=['c1', 'c2']):
+ assert element in selected
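For the TestSubstitutions hunks above: output escaping is now driven by named formatters, including the new 'html5' formatter that writes void elements without the XHTML-style slash. A sketch of the styles the tests compare, assuming the vendored bs4:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<br><b>Sacré bleu!</b>", "html.parser")
    print(soup.decode(formatter="minimal"))  # <br/><b>Sacré bleu!</b> -- escape only what HTML requires
    print(soup.decode(formatter="html"))     # <br/><b>Sacr&eacute; bleu!</b> -- named entities
    print(soup.decode(formatter="html5"))    # <br><b>Sacr&eacute; bleu!</b> -- no slash on <br>
    print(soup.decode(formatter=None))       # no escaping at all, even where HTML would need it
    print(soup.decode(formatter=lambda s: s.upper()))  # custom callable, run on every string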
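TestFormatter.test_sort_attributes shows the matching extension point: subclass Formatter and override attributes() to control which attributes are written and in what order. A hypothetical variant of the test's UnsortedFormatter (the bs4.element import path matches the vendored tree):

    from bs4 import BeautifulSoup
    from bs4.element import Formatter

    class SkipDataAttributes(Formatter):
        # Hypothetical subclass: drop data-* attributes, keep the rest sorted.
        def attributes(self, tag):
            for k, v in sorted(tag.attrs.items()):
                if not k.startswith('data-'):
                    yield k, v

    soup = BeautifulSoup('<p data-x="1" b="2" a="3"></p>', 'html.parser')
    print(soup.decode(formatter=SkipDataAttributes()))  # <p a="3" b="2"></p>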
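Finally, the selector hunks swap ValueError/NotImplementedError for SyntaxError and let :nth-of-type(0) match nothing instead of raising: since upstream 4.7, soup.select() is delegated to the soupsieve package, which reports malformed selectors as SyntaxError. A sketch of the new contract:

    from bs4 import BeautifulSoup

    markup = '<div class="c1"></div><div class="c2"></div><div class="c1"></div>'
    soup = BeautifulSoup(markup, 'html.parser')

    print(len(soup.select('.c1, .c2')))       # 3 -- duplicate classes are all returned
    print(soup.select('div:nth-of-type(0)'))  # [] -- zero simply matches nothing

    try:
        soup.select('tag%t')                  # malformed selector
    except SyntaxError as e:
        print('rejected by soupsieve:', e)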