# bandcamp.py
  1. import sys
  2. import datetime
  3. import json
  4. import logging
  5. import bs4
  6. import requests
  7. from requests.adapters import HTTPAdapter
  8. from urllib3.util import create_urllib3_context
  9. from urllib.parse import urlparse, urlunparse, urljoin
  10. from bandcamp_dl import __version__
  11. from bandcamp_dl.bandcampjson import BandcampJSON
  12. class SSLAdapter(HTTPAdapter):
  13. def __init__(self, ssl_context=None, **kwargs):
  14. self.ssl_context = ssl_context
  15. super().__init__(**kwargs)
  16. def init_poolmanager(self, *args, **kwargs):
  17. kwargs['ssl_context'] = self.ssl_context
  18. return super().init_poolmanager(*args, **kwargs)
  19. def proxy_manager_for(self, *args, **kwargs):
  20. kwargs['ssl_context'] = self.ssl_context
  21. return super().proxy_manager_for(*args, **kwargs)
  22. # Create the SSL context with the custom ciphers
  23. ctx = create_urllib3_context()
  24. ctx.load_default_certs()
  25. DEFAULT_CIPHERS = ":".join(
  26. [
  27. "ECDHE+AESGCM",
  28. "ECDHE+CHACHA20",
  29. "DHE+AESGCM",
  30. "DHE+CHACHA20",
  31. "ECDH+AESGCM",
  32. "DH+AESGCM",
  33. "ECDH+AES",
  34. "DH+AES",
  35. "RSA+AESGCM",
  36. "RSA+AES",
  37. "!aNULL",
  38. "!eNULL",
  39. "!MD5",
  40. "!DSS",
  41. "!AESCCM",
  42. ]
  43. )
  44. ctx.set_ciphers(DEFAULT_CIPHERS)
  45. class Bandcamp:
  46. def __init__(self):
  47. self.headers = {'User-Agent': f'bandcamp-dl/{__version__} '
  48. f'(https://github.com/evolution0/bandcamp-dl)'}
  49. self.soup = None
  50. self.tracks = None
  51. self.logger = logging.getLogger("bandcamp-dl").getChild("Main")
  52. # Mount the adapter with the custom SSL context to the session
  53. self.session = requests.Session()
  54. self.adapter = SSLAdapter(ssl_context=ctx)
  55. self.session.mount('https://', self.adapter)
  56. def parse(self, url: str, art: bool = True, lyrics: bool = False, genres: bool = False,
  57. debugging: bool = False, cover_quality: int = 0) -> dict or None:
  58. """Requests the page, cherry-picks album info
  59. :param url: album/track url
  60. :param art: if True download album art
  61. :param lyrics: if True fetch track lyrics
  62. :param genres: if True fetch track tags
  63. :param debugging: if True then verbose output
  64. :return: album metadata
  65. """
  66. try:
  67. response = self.session.get(url, headers=self.headers)
  68. except requests.exceptions.MissingSchema:
  69. return None
  70. if not response.ok:
  71. self.logger.debug(" Status code: %s", response.status_code)
  72. print(f"The Album/Track requested does not exist at: {url}")
  73. sys.exit(2)
  74. try:
  75. self.soup = bs4.BeautifulSoup(response.text, "lxml")
  76. except bs4.FeatureNotFound:
  77. self.soup = bs4.BeautifulSoup(response.text, "html.parser")
  78. self.logger.debug(" Generating BandcampJSON..")
  79. bandcamp_json = BandcampJSON(self.soup, debugging).generate()
  80. page_json = {}
  81. for entry in bandcamp_json:
  82. page_json = {**page_json, **json.loads(entry)}
  83. self.logger.debug(" BandcampJSON generated..")
  84. self.logger.debug(" Generating Album..")
  85. self.tracks = page_json['trackinfo']
  86. track_ids = {}
  87. if 'track' in page_json and 'itemListElement' in page_json['track']:
  88. for item in page_json['track']['itemListElement']:
  89. track_url = item['item']['@id']
  90. for prop in item['item'].get('additionalProperty', []):
  91. if prop.get('name') == 'track_id':
  92. track_ids[track_url] = prop.get('value')
  93. break
  94. track_nums = [track['track_num'] for track in self.tracks]
  95. if len(track_nums) != len(set(track_nums)):
  96. self.logger.debug(" Duplicate track numbers found, re-numbering based on position..")
  97. track_positions = {}
  98. if 'track' in page_json and 'itemListElement' in page_json['track']:
  99. for item in page_json['track']['itemListElement']:
  100. track_url = item['item']['@id']
  101. position = item['position']
  102. track_positions[track_url] = position
  103. if "/track/" in page_json['url']:
  104. artist_url = page_json['url'].rpartition('/track/')[0]
  105. else:
  106. artist_url = page_json['url'].rpartition('/album/')[0]
  107. for track in self.tracks:
  108. full_track_url = f"{artist_url}{track['title_link']}"
  109. if full_track_url in track_positions:
  110. track['track_num'] = track_positions[full_track_url]
  111. else:
  112. self.logger.debug(f" Could not find position for track: {full_track_url}")
  113. track['track_num'] = self.tracks.index(track) + 1
  114. album_release = page_json['album_release_date']
  115. if album_release is None:
  116. album_release = page_json['current']['release_date']
  117. if album_release is None:
  118. album_release = page_json['embed_info']['item_public']
  119. try:
  120. album_title = page_json['current']['title']
  121. except KeyError:
  122. album_title = page_json['trackinfo'][0]['title']
  123. try:
  124. label = page_json['item_sellers'][f'{page_json["current"]["selling_band_id"]}']['name']
  125. except KeyError:
  126. label = None
  127. album_id = None
  128. track_id_from_music_recording = None
  129. if page_json.get('@type') == 'MusicRecording':
  130. if 'additionalProperty' in page_json:
  131. for prop in page_json['additionalProperty']:
  132. if prop.get('name') == 'track_id':
  133. track_id_from_music_recording = prop.get('value')
  134. album_id = track_id_from_music_recording
  135. self.logger.debug(f" Single track page, found track_id: {track_id_from_music_recording}")
  136. break
  137. elif page_json.get('@type') == 'MusicAlbum':
  138. if 'albumRelease' in page_json:
  139. for release in page_json['albumRelease']:
  140. if 'additionalProperty' in release:
  141. for prop in release['additionalProperty']:
  142. if prop.get('name') == 'item_id':
  143. album_id = prop.get('value')
  144. self.logger.debug(f" Album page, found album_id: {album_id}")
  145. break
  146. if album_id:
  147. break
  148. album = {
  149. "tracks": [],
  150. "title": album_title,
  151. "artist": page_json['artist'],
  152. "label": label,
  153. "full": False,
  154. "art": "",
  155. "date": str(datetime.datetime.strptime(album_release, "%d %b %Y %H:%M:%S GMT").year),
  156. "url": url,
  157. "genres": "",
  158. "album_id": album_id
  159. }
  160. if "/track/" in page_json['url']:
  161. artist_url = page_json['url'].rpartition('/track/')[0]
  162. else:
  163. artist_url = page_json['url'].rpartition('/album/')[0]
  164. for track in self.tracks:
  165. full_track_url = f"{artist_url}{track['title_link']}"
  166. if track_id_from_music_recording:
  167. track['track_id'] = track_id_from_music_recording
  168. else:
  169. track['track_id'] = track_ids.get(full_track_url)
  170. if lyrics:
  171. track['lyrics'] = self.get_track_lyrics(f"{artist_url}"
  172. f"{track['title_link']}#lyrics")
  173. if track['file'] is not None:
  174. track = self.get_track_metadata(track)
  175. album['tracks'].append(track)
  176. album['full'] = self.all_tracks_available()
  177. if art:
  178. album['art'] = self.get_album_art(cover_quality)
  179. if genres:
  180. album['genres'] = "; ".join(page_json['keywords'])
  181. self.logger.debug(" Album generated..")
  182. self.logger.debug(" Album URL: %s", album['url'])
  183. return album
  184. def get_track_lyrics(self, track_url):
  185. self.logger.debug(" Fetching track lyrics..")
  186. track_page = self.session.get(track_url, headers=self.headers)
  187. try:
  188. track_soup = bs4.BeautifulSoup(track_page.text, "lxml")
  189. except bs4.FeatureNotFound:
  190. track_soup = bs4.BeautifulSoup(track_page.text, "html.parser")
  191. track_lyrics = track_soup.find("div", {"class": "lyricsText"})
  192. if track_lyrics:
  193. self.logger.debug(" Lyrics retrieved..")
  194. return track_lyrics.text
  195. else:
  196. self.logger.debug(" Lyrics not found..")
  197. return ""
  198. def all_tracks_available(self) -> bool:
  199. """Verify that all tracks have a url
  200. :return: True if all urls accounted for
  201. """
  202. for track in self.tracks:
  203. if track['file'] is None:
  204. return False
  205. return True
  206. def get_track_metadata(self, track: dict or None) -> dict:
  207. """Extract individual track metadata
  208. :param track: track dict
  209. :return: track metadata dict
  210. """
  211. self.logger.debug(" Generating track metadata..")
  212. track_metadata = {
  213. "duration": track['duration'],
  214. "track": str(track['track_num']),
  215. "title": track['title'],
  216. "artist": track['artist'],
  217. "track_id": track.get('track_id'),
  218. "url": None
  219. }
  220. if 'mp3-128' in track['file']:
  221. if 'https' in track['file']['mp3-128']:
  222. track_metadata['url'] = track['file']['mp3-128']
  223. else:
  224. track_metadata['url'] = "http:" + track['file']['mp3-128']
  225. else:
  226. track_metadata['url'] = None
  227. if track['has_lyrics'] is not False:
  228. if track['lyrics'] is not None:
  229. track_metadata['lyrics'] = track['lyrics'].replace('\\r\\n', '\n')
  230. self.logger.debug(" Track metadata generated..")
  231. return track_metadata
  232. @staticmethod
  233. def generate_album_url(artist: str, slug: str, page_type: str) -> str:
  234. """Generate an album url based on the artist and album name
  235. :param artist: artist name
  236. :param slug: Slug of album/track
  237. :param page_type: Type of page album/track
  238. :return: url as str
  239. """
  240. return f"http://{artist}.bandcamp.com/{page_type}/{slug}"
  241. def get_album_art(self, quality: int = 0) -> str:
  242. """Find and retrieve album art url from page
  243. :param quality: The quality of the album art to retrieve
  244. :return: url as str
  245. """
  246. try:
  247. url = self.soup.find(id='tralbumArt').find_all('a')[0]['href']
  248. return f"{url[:-6]}{quality}{url[-4:]}"
  249. except None:
  250. pass
    def get_full_discography(self, artist: str, page_type: str) -> list:
        """Generate a list of album and track urls based on the artist name.

        :param artist: artist name
        :param page_type: Type of page, it should be music but it's a parameter so it's not
            hardcoded
        :return: urls as list of strs
        """
        album_urls = set()
        music_page_url = f"https://{artist}.bandcamp.com/{page_type}"
        self.logger.info(f"Scraping discography from: {music_page_url}")
        try:
            response = self.session.get(music_page_url, headers=self.headers)
        except requests.exceptions.RequestException as e:
            # Best-effort: a failed fetch yields an empty discography.
            self.logger.error(f"Could not fetch artist page {music_page_url}: {e}")
            return []
        try:
            soup = bs4.BeautifulSoup(response.text, "lxml")
        except bs4.FeatureNotFound:
            # lxml not installed; fall back to the stdlib parser.
            soup = bs4.BeautifulSoup(response.text, "html.parser")
        music_grid = soup.find('ol', {'id': 'music-grid'})
        if not music_grid:
            self.logger.warning("Could not find music grid on the page. No albums found.")
            return []
        # Some discography pages ship their items as JSON in the grid's
        # data-client-items attribute rather than as rendered <li> elements.
        if 'data-client-items' in music_grid.attrs:
            self.logger.debug("Found data-client-items attribute. Parsing for album URLs.")
            try:
                # Run the attribute value through BeautifulSoup to unescape
                # any HTML entities before JSON-decoding it.
                json_string = bs4.BeautifulSoup(music_grid['data-client-items'], "html.parser").text
                items = json.loads(json_string)
                for item in items:
                    if 'page_url' in item:
                        # page_url may be relative; resolve against the page.
                        full_url = urljoin(music_page_url, item['page_url'])
                        album_urls.add(full_url)
            except (json.JSONDecodeError, TypeError) as e:
                self.logger.error(f"Failed to parse data-client-items JSON: {e}")
        # NOTE(review): this <li> scrape runs even when data-client-items was
        # parsed above; the set deduplicates any overlap between the two paths.
        self.logger.debug("Scraping all <li> elements in the music grid for links.")
        for a in music_grid.select('li.music-grid-item a'):
            href = a.get('href')
            if href:
                full_url = urljoin(music_page_url, href)
                album_urls.add(full_url)
        self.logger.info(f"Found a total of {len(album_urls)} unique album/track links.")
        return list(album_urls)