[minhateca] Add extractor (Fixes #4094)

10 years ago · 4349c07dd7
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -376,6 +376,7 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(parse_filesize('2 MiB'), 2097152)
        self.assertEqual(parse_filesize('5 GB'), 5000000000)
        self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
        self.assertEqual(parse_filesize('1,24 KB'), 1240)

 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -217,6 +217,7 @@ from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
 from .minhateca import MinhatecaIE
 from .ministrygrid import MinistryGridIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mitele import MiTeleIE
--- a/youtube_dl/extractor/minhateca.py
+++ b/youtube_dl/extractor/minhateca.py
@ -0,0 +1,71 @@
 # coding: utf-8
 from __future__ import unicode_literals

 from .common import InfoExtractor
 from ..compat import (
    compat_urllib_parse,
    compat_urllib_request,
 )
 from ..utils import (
    int_or_none,
    parse_filesize,
 )


 class MinhatecaIE(InfoExtractor):
    _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
    _TEST = {
        'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
        'info_dict': {
            'id': '125848331',
            'ext': 'mp4',
            'title': 'youtube-dl test video',
            'thumbnail': 're:^https?://.*\.jpg$',
            'filesize_approx': 1530000,
            'duration': 9,
            'view_count': int,
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        token = self._html_search_regex(
            r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
            webpage, 'request token')
        token_data = [
            ('fileId', video_id),
            ('__RequestVerificationToken', token),
        ]
        req = compat_urllib_request.Request(
            'http://minhateca.com.br/action/License/Download',
            data=compat_urllib_parse.urlencode(token_data))
        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
        data = self._download_json(
            req, video_id, note='Downloading metadata')

        video_url = data['redirectUrl']
        title_str = self._html_search_regex(
            r'<h1.*?>(.*?)</h1>', webpage, 'title')
        title, _, ext = title_str.rpartition('.')
        filesize_approx = parse_filesize(self._html_search_regex(
            r'<p class="fileSize">(.*?)</p>',
            webpage, 'file size approximation', fatal=False))
        duration = int_or_none(self._html_search_regex(
            r'(?s)<p class="fileLeng[ht][th]">.*?([0-9]+)\s*s',
            webpage, 'duration', fatal=False))
        view_count = int_or_none(self._html_search_regex(
            r'<p class="downloadsCounter">([0-9]+)</p>',
            webpage, 'view count', fatal=False))

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': ext,
            'filesize_approx': filesize_approx,
            'duration': duration,
            'view_count': view_count,
            'thumbnail': self._og_search_thumbnail(webpage),
        }
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -1090,11 +1090,14 @@ def parse_filesize(s):
    }

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(r'(?P<num>[0-9]+(?:\.[0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    return int(float(m.group('num')) * _UNIT_TABLE[m.group('unit')])
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)


 def get_term_width():