[moviefap] Add new extractor

10 years ago · 82ea1051b5
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -311,6 +311,7 @@ from .morningstar import MorningstarIE
 from .motherless import MotherlessIE
 from .motorsport import MotorsportIE
 from .movieclips import MovieClipsIE
 from .moviefap import MovieFapIE
 from .moviezine import MoviezineIE
 from .movshare import MovShareIE
 from .mtv import (
--- a/youtube_dl/extractor/moviefap.py
+++ b/youtube_dl/extractor/moviefap.py
@@ -0,0 +1,117 @@
 from __future__ import unicode_literals

 import re

 from .common import InfoExtractor
 from ..utils import str_to_int


 class MovieFapIE(InfoExtractor):
    """
    Information extractor for video pages on moviefap.com.

    The watch page supplies the title and the descriptive metadata; an XML
    player configuration referenced from that page supplies the media URL(s),
    the start thumbnail and the optional timeline preview images.
    """

    # The slug ("name") runs up to the first '/', '?', '#' or '.', so slugs
    # containing digits or other characters are captured whole instead of
    # being cut at the first character outside a-z, '-' and '_'.
    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<name>[^/?#.]+)'
    _TESTS = [{
        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
        'md5': 'fa56683e291fc80635907168a743c9ad',
        'info_dict': {
            'id': 'e5da0d3edce5404418f5',
            'ext': 'flv',
            'title': 'Jeune Couple Russe',
            'description': 'Amateur',
            'thumbnail': 'http://pic.moviefap.com/thumbs/e5/949-18l.jpg',
            'uploader_id': 'whiskeyjar',
            'display_id': 'jeune-couple-russe'
        }
    }, {
        'url': 'http://www.moviefap.com/videos/3080837f6712355015c2/busty-british-blonde-takes-backdoor-in-fake-taxi.html',
        'md5': 'bedef72cb23d27a20755fc430a6d7a0e',
        'info_dict': {
            'id': '3080837f6712355015c2',
            'ext': 'mp4',
            'title': 'Busty British blonde takes backdoor in fake taxi',
            'description': 'Big boobs British blonde flashing in fake taxi then giving titsjob and rimjob in the back seat before getting big cock up her tight ass',
            'thumbnail': 'http://img.moviefap.com/a16:9w990r/thumbs/30/322021-18l.jpg',
            'uploader_id': 'momcikoper',
            'display_id': 'busty-british-blonde-takes-backdoor-in-fake-taxi'
        }
    }]

    @staticmethod
    def __get_thumbnail_data(xml):
        """
        Constructs a list of video thumbnails from timeline preview images.
        :param xml: the information XML document to parse
        :return: a list of dicts with 'url', 'width' and 'height' keys, one
                 per preview image; empty when the document has no usable
                 timeline section
        """

        timeline = xml.find('timeline')
        if timeline is None:
            # not all videos have the data - ah well
            return []

        # The previews are optional extras, so an incomplete timeline section
        # yields no thumbnails rather than an error.
        numbers = {}
        for tag in ('imageWidth', 'imageHeight', 'imageFirst', 'imageLast'):
            raw = timeline.findtext(tag)
            if not raw:
                return []
            numbers[tag] = str_to_int(raw)

        pattern = timeline.findtext('imagePattern')
        if not pattern:
            return []

        # '#' in the pattern is the placeholder for the image index; the
        # index range is inclusive at both ends.
        return [{
            'url': pattern.replace('#', '%d' % index),
            'width': numbers['imageWidth'],
            'height': numbers['imageHeight'],
        } for index in range(numbers['imageFirst'], numbers['imageLast'] + 1)]

    def _search_optional(self, pattern, webpage, name):
        """
        Looks up an optional piece of page metadata.
        :param pattern: regular expression with one capturing group
        :param webpage: the page HTML to search
        :param name: human-readable field name used in diagnostics
        :return: the captured text, or None when the pattern does not match
        """
        return self._html_search_regex(pattern, webpage, name, fatal=False)

    def _search_optional_int(self, pattern, webpage, name):
        """
        Same as _search_optional, but converts the captured text with
        str_to_int. Returns None when the pattern does not match.
        """
        text = self._search_optional(pattern, webpage, name)
        if text is None:
            return None
        return str_to_int(text)

    def _real_extract(self, url):
        """
        Builds the info dictionary for one video page.
        :param url: a page URL matching _VALID_URL
        :return: info dict; it carries either a single 'url' or a 'formats'
                 list, depending on what the player configuration offers
        """

        video_id = self._match_id(url)
        display_id = re.match(self._VALID_URL, url).group('name')

        webpage = self._download_webpage(url, video_id)

        # The page points at an XML document describing the player; without
        # it there is nothing to download, so this lookup stays fatal.
        info_url = self._html_search_regex(
            r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters')
        xml = self._download_xml(info_url, video_id)

        # Everything below except the title is optional metadata: a field
        # that cannot be found is reported as None instead of aborting.
        rating_text = self._search_optional(
            r'Current Rating<br> <strong>(.*?)</strong>', webpage, 'average_rating')
        try:
            average_rating = float(rating_text)
        except (TypeError, ValueError):
            # TypeError: field not found (None); ValueError: not a number
            average_rating = None

        categories_text = self._search_optional(
            r'</div>\s*(.*?)\s*<br>', webpage, 'categories')

        info = {
            'id': video_id,
            'title': self._html_search_regex(
                r'<div id="view_title"><h1>(.*?)</h1>', webpage, 'title'),
            'display_id': display_id,
            'thumbnails': self.__get_thumbnail_data(xml),
            'thumbnail': xml.findtext('startThumb'),
            'description': self._search_optional(
                r'name="description" value="(.*?)"', webpage, 'description'),
            'uploader_id': self._search_optional(
                r'name="username" value="(.*?)"', webpage, 'uploader_id'),
            'view_count': self._search_optional_int(
                r'<br>Views <strong>([0-9]+)</strong>', webpage, 'view_count'),
            'average_rating': average_rating,
            'comment_count': self._search_optional_int(
                r'<span id="comCount">([0-9]+)</span>', webpage, 'comment_count'),
            'age_limit': 18,
            # fall back to the requested URL when the page has no link field
            'webpage_url': self._html_search_regex(
                r'name="link" value="(.*?)"', webpage, 'webpage_url', default=url),
            'categories': categories_text.split(', ') if categories_text else None,
        }

        # Container type as declared by the player config; 'flv' is a guess
        # used when the config does not state one.
        info['ext'] = xml.findtext('videoConfig/type') or 'flv'

        # work out the video URL(s)
        single_link = xml.find('videoLink')
        if single_link is not None:
            # single format available
            info['url'] = single_link.text
        else:
            # multiple formats available
            # N.B. formats are already in ascending order of quality
            info['formats'] = [{
                'url': item.find('videoLink').text,
                'resolution': item.find('res').text,  # 480p etc.
            } for item in xml.find('quality').findall('item')]

        return info