music-kraken-core/music_kraken/objects/source.py

238 lines
7.6 KiB
Python
Raw Permalink Normal View History

from __future__ import annotations
2023-03-09 21:14:39 +00:00
from collections import defaultdict
2023-01-12 15:25:50 +00:00
from enum import Enum
2024-05-15 10:18:08 +00:00
from typing import (
List,
Dict,
Set,
Tuple,
Optional,
Iterable,
Generator,
TypedDict,
Callable,
Any,
TYPE_CHECKING
)
2024-04-29 12:40:49 +00:00
from urllib.parse import urlparse, ParseResult
from dataclasses import dataclass, field
from functools import cached_property
2023-01-12 15:25:50 +00:00
2024-04-29 15:36:43 +00:00
from ..utils import generate_id
2024-05-15 11:16:11 +00:00
from ..utils.enums import SourceType, ALL_SOURCE_TYPES
2023-09-10 14:27:09 +00:00
from ..utils.config import youtube_settings
2024-04-29 15:06:31 +00:00
from ..utils.string_processing import hash_url, shorten_display_url
2023-09-10 14:27:09 +00:00
2023-03-10 08:09:35 +00:00
from .metadata import Mapping, Metadata
2024-05-15 10:18:08 +00:00
if TYPE_CHECKING:
from ..pages.abstract import Page
2023-01-12 15:25:50 +00:00
2023-01-20 22:05:15 +00:00
@dataclass
class Source:
    """
    A url on an external service, together with the source type it belongs to.

    Two sources are treated as the same entry by SourceCollection if one of
    their indexing values (hashed url / hashed audio url) is equal.
    """
    # the service the url belongs to
    source_type: SourceType
    url: str
    # falls back to source_type in __post_init__ if it is not given
    referrer_page: Optional[SourceType] = None
    # optional second url; if set it is indexed next to url
    audio_url: Optional[str] = None

    # free-form extra data; combined with dict.update in __merge__
    additional_data: dict = field(default_factory=dict)

    def __post_init__(self):
        # without an explicit referrer the source refers to its own source type
        self.referrer_page = self.referrer_page or self.source_type

    @classmethod
    def match_url(cls, url: str, referrer_page: SourceType) -> Optional[Source]:
        """
        Guesses the source type from the url and builds the matching source.

        This should only be used if it is not certain which source type the
        url belongs to, because it has to try every known pattern and is thus
        less efficient than constructing the source directly.

        Args:
            url (str): The url to classify.
            referrer_page (SourceType): Passed on to the created source.
        Returns:
            Optional[Source]: The matched source, or None if no pattern matches.
        """
        parsed_url = urlparse(url)
        url = parsed_url.geturl()
        host = parsed_url.netloc

        if "musify" in host:
            return cls(ALL_SOURCE_TYPES.MUSIFY, url, referrer_page=referrer_page)

        if any(host == _url.netloc for _url in youtube_settings['youtube_url']):
            return cls(ALL_SOURCE_TYPES.YOUTUBE, url, referrer_page=referrer_page)

        if url.startswith("https://www.deezer"):
            return cls(ALL_SOURCE_TYPES.DEEZER, url, referrer_page=referrer_page)

        if url.startswith("https://open.spotify.com"):
            return cls(ALL_SOURCE_TYPES.SPOTIFY, url, referrer_page=referrer_page)

        # only look at the host: the word "bandcamp" in a path or query
        # (for example a wikipedia article about bandcamp) is not a bandcamp url
        if "bandcamp" in host:
            return cls(ALL_SOURCE_TYPES.BANDCAMP, url, referrer_page=referrer_page)

        if "wikipedia" in host:
            return cls(ALL_SOURCE_TYPES.WIKIPEDIA, url, referrer_page=referrer_page)

        if url.startswith("https://www.metal-archives.com/"):
            return cls(ALL_SOURCE_TYPES.ENCYCLOPAEDIA_METALLUM, url, referrer_page=referrer_page)

        # the less important ones
        if url.startswith("https://www.facebook"):
            return cls(ALL_SOURCE_TYPES.FACEBOOK, url, referrer_page=referrer_page)

        if url.startswith("https://www.instagram"):
            return cls(ALL_SOURCE_TYPES.INSTAGRAM, url, referrer_page=referrer_page)

        if url.startswith("https://twitter"):
            return cls(ALL_SOURCE_TYPES.TWITTER, url, referrer_page=referrer_page)

        if url.startswith("https://myspace.com"):
            return cls(ALL_SOURCE_TYPES.MYSPACE, url, referrer_page=referrer_page)

        return None

    @property
    def has_page(self) -> bool:
        """True if the source type has a page object attached."""
        return self.source_type.page is not None

    @property
    def page(self) -> Page:
        """The page object of the source type (None if there is none)."""
        return self.source_type.page

    @property
    def parsed_url(self) -> ParseResult:
        """The url split into its components by urlparse."""
        return urlparse(self.url)

    @property
    def hash_url(self) -> str:
        """The url passed through the hash_url helper."""
        return hash_url(self.url)

    @property
    def indexing_values(self) -> list:
        """
        The keys this source is indexed under: the hashed url, followed by
        the hashed audio url if one is set.
        """
        values = [hash_url(self.url)]
        if self.audio_url:
            values.append(hash_url(self.audio_url))
        return values

    def __repr__(self) -> str:
        return f"Src({self.source_type.value}: {shorten_display_url(self.url)})"

    def __merge__(self, other: Source, **kwargs):
        """
        Takes over the data of other in place.

        The own audio url wins; the one of other is only used if there is
        none yet. On equal keys in additional_data the value of other wins.
        """
        if self.audio_url is None:
            self.audio_url = other.audio_url
        self.additional_data.update(other.additional_data)

    @property
    def page_str(self):
        """The value attribute of the source type."""
        return self.source_type.value
2023-01-25 13:14:15 +00:00
2024-05-15 09:44:39 +00:00
class SourceTypeSorting(TypedDict, total=False):
    """
    Keyword arguments that are forwarded to SourceCollection.source_types.

    Every key is optional (total=False): a missing key falls back to the
    default of the matching parameter of source_types.
    """
    # function that maps a source type to the value it is sorted by
    sort_key: Callable[[SourceType], Any]
    # sort in descending order if True
    reverse: bool
    # keep only the source types that have a page
    only_with_page: bool
2024-04-29 12:40:49 +00:00
class SourceCollection:
    """
    A collection of sources without duplicates.

    The sources are grouped by their source type and additionally indexed by
    their indexing values, so that a source which shares an indexing value
    with an already contained one is merged into it instead of being added.
    """
    # replaced by a fresh id on the instance whenever a new indexing value is added
    __change_version__ = generate_id()

    _indexed_sources: Dict[str, Source]
    _sources_by_type: Dict[SourceType, List[Source]]

    def __init__(self, data: Optional[Iterable[Source]] = None, **kwargs):
        self._sources_by_type = defaultdict(list)
        self._indexed_sources = {}

        self.extend(data or [])

    def source_types(
        self,
        only_with_page: bool = False,
        sort_key = lambda page: page.name,
        reverse: bool = False
    ) -> Iterable[SourceType]:
        """
        Returns a list of all source types contained in this source collection.

        Args:
            only_with_page (bool, optional): If True, only returns source types that have a page, meaning you can download from them.
            sort_key (function, optional): A function that defines the sorting key for the source types. Defaults to lambda page: page.name.
            reverse (bool, optional): If True, sorts the source types in reverse order. Defaults to False.

        Returns:
            Iterable[SourceType]: A sorted list of source types.
        """
        source_types: Iterable[SourceType] = self._sources_by_type.keys()
        if only_with_page:
            source_types = filter(lambda st: st.has_page, source_types)

        return sorted(
            source_types,
            key=sort_key,
            reverse=reverse
        )

    def get_sources(
        self,
        *source_types: SourceType,
        source_type_sorting: Optional[SourceTypeSorting] = None
    ) -> Generator[Source, None, None]:
        """
        Retrieves sources based on the provided source types and source type sorting.

        Args:
            *source_types (SourceType): Variable number of source types to filter the sources.
                Source types that are not part of this collection yield nothing.
            source_type_sorting (SourceTypeSorting): Sorting criteria for the source types. This is only relevant if no source types are provided.

        Yields:
            Source: The sources of every requested source type, one source type after the other.
        """
        if not source_types:
            source_type_sorting = source_type_sorting or {}
            source_types = self.source_types(**source_type_sorting)

        for source_type in source_types:
            # .get instead of [] because _sources_by_type is a defaultdict:
            # indexing an unknown source type would insert an empty list and
            # make that type show up in source_types() and homepage_list
            yield from self._sources_by_type.get(source_type, ())

    def append(self, source: Optional[Source]):
        """
        Adds a source, or merges it into the contained source that shares
        one of its indexing values. None is ignored.
        """
        if source is None:
            return

        # the first indexing value that is already known decides the merge target
        existing_source = None
        for key in source.indexing_values:
            if key in self._indexed_sources:
                existing_source = self._indexed_sources[key]
                break

        if existing_source is not None:
            existing_source.__merge__(source)
            source = existing_source
        else:
            self._sources_by_type[source.source_type].append(source)

        # (re)index under every indexing value; a merge can have added one
        changed = False
        for key in source.indexing_values:
            if key not in self._indexed_sources:
                changed = True
            self._indexed_sources[key] = source

        if changed:
            self.__change_version__ = generate_id()

    def extend(self, sources: Iterable[Source]):
        """Appends every source of the iterable, one by one."""
        for source in sources:
            self.append(source)

    def __iter__(self):
        yield from self.get_sources()

    def __merge__(self, other: SourceCollection, **kwargs):
        """Appends every source of the other collection to this one."""
        self.extend(other)

    @property
    def hash_url_list(self) -> List[str]:
        """The hashed url of every contained source."""
        return [hash_url(source.url) for source in self.get_sources()]

    @property
    def url_list(self) -> List[str]:
        """The url of every contained source."""
        return [source.url for source in self.get_sources()]

    @property
    def homepage_list(self) -> List[str]:
        """The homepage attribute of every contained source type."""
        return [source_type.homepage for source_type in self._sources_by_type]

    def indexing_values(self) -> Generator[Tuple[str, str], None, None]:
        """Yields a ("url", key) tuple for every key the sources are indexed under."""
        for index in self._indexed_sources:
            yield "url", index