[soundcloud:user] Rework extractor (Closes #6399)

10 years ago · 80fb6d4aa4
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):
    _VALID_URL = r'''(?x)^(?:https?://)?
                    (?:(?:(?:www\.|m\.)?soundcloud\.com/
                            (?P<uploader>[\w\d-]+)/
                            (?!sets/|(?:likes|tracks)/?(?:$|[?#]))
                            (?!(?:tracks|sets|reposts|likes|spotlight)/?(?:$|[?#]))
                            (?P<title>[\w\d-]+)/?
                            (?P<token>[^?]+?)?(?:[?].*)?$)
                       |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@@ -293,60 +293,131 @@ class SoundcloudSetIE(SoundcloudIE):


 # NOTE(review): this hunk is shown without its +/- diff markers, so removed and
 # added lines appear interleaved (two consecutive _VALID_URL assignments,
 # repeated dict keys, an `if` with no body before another `if`). Presumably the
 # first line of each such pair is the removed one and the second the added
 # one -- confirm against the original commit before treating this as code.
 class SoundcloudUserIE(SoundcloudIE):
    # Matches http(s) soundcloud.com URLs (optionally on www. or m.), capturing
    # the first path segment as 'user' and an optional trailing resource name
    # as 'rsrc'.
    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|sets|reposts|likes|spotlight)/?)?(\?.*)?$'
    IE_NAME = 'soundcloud:user'
    _TESTS = [{
        'url': 'https://soundcloud.com/the-concept-band',
        'url': 'https://soundcloud.com/the-akashic-chronicler',
        'info_dict': {
            'id': '9615865',
            'title': 'The Royal Concept',
            'id': '114582580',
            'title': 'The Akashic Chronicler (All)',
        },
        'playlist_mincount': 12
        'playlist_mincount': 112,
    }, {
        'url': 'https://soundcloud.com/the-concept-band/likes',
        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
        'info_dict': {
            'id': '9615865',
            'title': 'The Royal Concept',
            'id': '114582580',
            'title': 'The Akashic Chronicler (Tracks)',
        },
        'playlist_mincount': 1,
        'playlist_mincount': 50,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
        'only_matching': True,
        'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Playlists)',
        },
        'playlist_mincount': 3,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Reposts)',
        },
        'playlist_mincount': 9,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Likes)',
        },
        'playlist_mincount': 333,
    }, {
        'url': 'https://soundcloud.com/grynpyret/spotlight',
        'info_dict': {
            'id': '7098329',
            'title': 'Grynpyret (Spotlight)',
        },
        'playlist_mincount': 1,
    }]

    _API_BASE = 'https://api.soundcloud.com'
    _API_V2_BASE = 'https://api-v2.soundcloud.com'

    # Per-resource URL templates, keyed by the 'rsrc' group (or 'all' when the
    # URL names no resource). The remaining %s placeholder is filled with
    # user['id'] in _real_extract.
    _BASE_URL_MAP = {
        'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
        'tracks': '%s/users/%%s/tracks' % _API_BASE,
        'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
        'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
        'likes': '%s/users/%%s/likes' % _API_V2_BASE,
        'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
    }

    # Label for each resource; _real_extract appends it in parentheses to the
    # username to form the playlist title.
    _TITLE_MAP = {
        'all': 'All',
        'tracks': 'Tracks',
        'sets': 'Playlists',
        'reposts': 'Reposts',
        'likes': 'Likes',
        'spotlight': 'Spotlight',
    }

    # Resolves the user named in the URL, pages through the selected resource
    # listing, and returns a playlist dict whose entries are url_result()
    # references built from each item's permalink_url.
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group('user')
        resource = mobj.group('rsrc')
        if resource is None:
            resource = 'tracks'
        elif resource == 'likes':
            resource = 'favorites'

        # Look the user up through the resolve endpoint to obtain the user
        # JSON (its 'id' and 'username' are used below).
        url = 'http://soundcloud.com/%s/' % uploader
        resolv_url = self._resolv_url(url)
        user = self._download_json(
            resolv_url, uploader, 'Downloading user info')
        base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource)

        # A URL without a resource segment selects the 'all' listing.
        resource = mobj.group('rsrc') or 'all'
        base_url = self._BASE_URL_MAP[resource] % user['id']

        next_href = None

        entries = []
        # i only drives the offset of a freshly built query and the page number
        # shown in the progress message.
        for i in itertools.count():
            data = compat_urllib_parse.urlencode({
                'offset': i * 50,
                'limit': 50,
                'client_id': self._CLIENT_ID,
            })
            new_entries = self._download_json(
                base_url + data, uploader, 'Downloading track page %s' % (i + 1))
            if len(new_entries) == 0:
            # No cursor from a previous response: build the page URL from an
            # explicit offset/limit query.
            if not next_href:
                data = compat_urllib_parse.urlencode({
                    'offset': i * 50,
                    'limit': 50,
                    'client_id': self._CLIENT_ID,
                    'linked_partitioning': '1',
                    'representation': 'speedy',
                })
                next_href = base_url + '?' + data

            response = self._download_json(
                next_href, uploader, 'Downloading track page %s' % (i + 1))

            collection = response['collection']

            # An empty collection ends pagination.
            if not collection:
                self.to_screen('%s: End page received' % uploader)
                break
            entries.extend(self.url_result(e['permalink_url'], 'Soundcloud') for e in new_entries)

            # Returns the first 'permalink_url' starting with 'http' found among
            # the dict candidates; non-dict candidates are skipped and None is
            # returned implicitly when nothing qualifies.
            def resolve_permalink_url(candidates):
                for cand in candidates:
                    if isinstance(cand, dict):
                        permalink_url = cand.get('permalink_url')
                        if permalink_url and permalink_url.startswith('http'):
                            return permalink_url

            # The permalink may sit on the item itself or on its nested 'track'
            # or 'playlist' value; items with no usable permalink are dropped.
            for e in collection:
                permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
                if permalink_url:
                    entries.append(self.url_result(permalink_url))

            # Follow the next_href given in the response; a present-but-empty
            # value stops the loop, and a missing key falls back to building
            # the next offset-based query on the following iteration.
            if 'next_href' in response:
                next_href = response['next_href']
                if not next_href:
                    break
            else:
                next_href = None

        return {
            '_type': 'playlist',
            'id': compat_str(user['id']),
            'title': user['username'],
            'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
            'entries': entries,
        }