[googledrive] Add new extractor

10 years ago · 984e4d4875
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -209,6 +209,7 @@ from .globo import GloboIE
 from .godtube import GodTubeIE
 from .goldenmoustache import GoldenMoustacheIE
 from .golem import GolemIE
 from .googledrive import GoogleDriveIE
 from .googleplus import GooglePlusIE
 from .googlesearch import GoogleSearchIE
 from .gorillavid import GorillaVidIE
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@ -0,0 +1,106 @@
 from .common import InfoExtractor
 from ..utils import RegexNotFoundError

 class GoogleDriveIE(InfoExtractor):
    """Information extractor for videos stored on Google Drive / Google Docs.

    Accepts ``video.google.com/get_player?...docid=<id>`` URLs as well as
    ``docs.google.com`` / ``drive.google.com`` URLs of the ``uc?...id=<id>``
    and ``file/d/<id>`` forms (see ``_VALID_URL``).  The file page is
    downloaded from ``docs.google.com/file/d/<id>`` and the metadata is
    scraped from the ``"key","value"`` pairs embedded in it.
    """

    _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)'
    _TEST = {
        'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1',
        'info_dict': {
            'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U',
            'ext': 'mp4',
            'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4',
        }
    }
    # Maps the format id found in "fmt_stream_map" to its container
    # extension.  Ids missing from this table get no explicit extension.
    _formats = {
        '5': {'ext': 'flv'},
        '6': {'ext': 'flv'},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp'},
        '18': {'ext': 'mp4'},
        '22': {'ext': 'mp4'},
        '34': {'ext': 'flv'},
        '35': {'ext': 'flv'},
        '36': {'ext': '3gp'},
        '37': {'ext': 'mp4'},
        '38': {'ext': 'mp4'},
        '43': {'ext': 'webm'},
        '44': {'ext': 'webm'},
        '45': {'ext': 'webm'},
        '46': {'ext': 'webm'},
        '59': {'ext': 'mp4'}
    }

    def _extract_page_value(self, webpage, key):
        """Return the value of the ``"<key>","<value>"`` pair in *webpage*.

        Raises RegexNotFoundError when the pair is absent.
        """
        return self._html_search_regex(
            r'"%s","(?P<%s>.*?)"' % (key, key), webpage, key, group=key)

    def _failure_message(self, webpage):
        """Return the page's own "reason" text, or a generic fallback."""
        try:
            return self._extract_page_value(webpage, 'reason')
        except RegexNotFoundError:
            return 'not a video'

    def _build_formats(self, fmt_stream_map, fmt_list):
        """Build the list of format dicts from the two comma-separated fields.

        *fmt_stream_map* holds ``<format id>|<url>`` entries; the second
        ``/``-separated field of the entry at the same position in
        *fmt_list* is the ``<width>x<height>`` resolution.
        """
        formats = []
        # zip() pairs the entries positionally and stops at the shorter
        # list, so a length mismatch no longer raises IndexError.
        for stream, fmt in zip(fmt_stream_map.split(','), fmt_list.split(',')):
            # Split on the first '|' only so that a URL which itself
            # contains '|' stays intact; skip entries without a separator.
            fmt_id, separator, fmt_url = stream.partition('|')
            if not separator:
                continue
            resolution = fmt.split('/')[1]
            width, height = resolution.split('x')
            formats.append({
                'url': fmt_url,
                'format_id': fmt_id,
                'resolution': resolution,
                'width': int(width),
                'height': int(height),
                # Unknown format ids yield None instead of a KeyError.
                'ext': self._formats.get(fmt_id, {}).get('ext'),
            })
        return formats

    def _real_extract(self, url):
        """Extract title, duration and formats for the video at *url*.

        Returns the info dict, or None (after reporting a warning) when the
        page does not carry the expected video metadata.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://docs.google.com/file/d/%s' % video_id,
            video_id, encoding='unicode_escape')

        try:
            title = self._extract_page_value(webpage, 'title')
            fmt_stream_map = self._extract_page_value(webpage, 'fmt_stream_map')
            fmt_list = self._extract_page_value(webpage, 'fmt_list')
            length_seconds = self._extract_page_value(webpage, 'length_seconds')
        except RegexNotFoundError:
            # Any missing field means the page is not a playable video;
            # surface the reason given by the page when there is one.
            self.report_warning(self._failure_message(webpage))
            return

        formats = self._build_formats(fmt_stream_map, fmt_list)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'duration': int(length_seconds),
            'formats': formats
        }