Browse Source

[utils] Add get_element_by_class

For #9950
totalwebcasting
Yen Chi Hsuan 8 years ago
parent
commit
84c237fb8a
No known key found for this signature in database GPG Key ID: 3FDDD575826C5C30
2 changed files with 19 additions and 2 deletions
  1. +9
    -0
      test/test_utils.py
  2. +10
    -2
      youtube_dl/utils.py

+ 9
- 0
test/test_utils.py View File

@ -33,6 +33,7 @@ from youtube_dl.utils import (
ExtractorError, ExtractorError,
find_xpath_attr, find_xpath_attr,
fix_xml_ampersands, fix_xml_ampersands,
get_element_by_class,
InAdvancePagedList, InAdvancePagedList,
intlist_to_bytes, intlist_to_bytes,
is_html, is_html,
@ -991,5 +992,13 @@ The first line
self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(3, 1), 1)
self.assertEqual(urshift(-3, 1), 2147483646) self.assertEqual(urshift(-3, 1), 2147483646)
def test_get_element_by_class(self):
html = '''
<span class="foo bar">nice</span>
'''
self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

+ 10
- 2
youtube_dl/utils.py View File

@ -310,9 +310,17 @@ def get_element_by_id(id, html):
return get_element_by_attribute('id', id, html) return get_element_by_attribute('id', id, html)
def get_element_by_attribute(attribute, value, html):
def get_element_by_class(class_name, html):
return get_element_by_attribute(
'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True):
"""Return the content of the tag with the specified attribute in the passed HTML document""" """Return the content of the tag with the specified attribute in the passed HTML document"""
value = re.escape(value) if escape_value else value
m = re.search(r'''(?xs) m = re.search(r'''(?xs)
<([a-zA-Z0-9:._-]+) <([a-zA-Z0-9:._-]+)
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
@ -321,7 +329,7 @@ def get_element_by_attribute(attribute, value, html):
\s*> \s*>
(?P<content>.*?) (?P<content>.*?)
</\1> </\1>
''' % (re.escape(attribute), re.escape(value)), html)
''' % (re.escape(attribute), value), html)
if not m: if not m:
return None return None


Loading…
Cancel
Save