from __future__ import annotations

from collections import defaultdict
from enum import Enum
from typing import List, Dict, Set, Tuple, Optional, Iterable, Generator
from urllib.parse import urlparse, ParseResult
from dataclasses import dataclass, field
from functools import cached_property

from ..utils import generate_id
from ..utils.enums.source import SourcePages, SourceTypes
from ..utils.config import youtube_settings
from ..utils.string_processing import hash_url, shorten_display_url

from .metadata import Mapping, Metadata
from .parents import OuterProxy
from .collection import Collection


@dataclass
class Source:
    page_enum: SourcePages
    url: str
    referrer_page: Optional[SourcePages] = None
    audio_url: Optional[str] = None

    additional_data: dict = field(default_factory=dict)

    def __post_init__(self):
        self.referrer_page = self.referrer_page or self.page_enum

    @property
    def parsed_url(self) -> ParseResult:
        return urlparse(self.url)

    @classmethod
    def match_url(cls, url: str, referrer_page: SourcePages) -> Optional[Source]:
        """
        Matches the url against every known page and returns a Source for the first match.
        Only use this if you don't already know which page the url belongs to,
        because probing every page is less efficient.
        """
        parsed_url = urlparse(url)
        url = parsed_url.geturl()

        if "musify" in parsed_url.netloc:
            return cls(SourcePages.MUSIFY, url, referrer_page=referrer_page)

        if parsed_url.netloc in [_url.netloc for _url in youtube_settings['youtube_url']]:
            return cls(SourcePages.YOUTUBE, url, referrer_page=referrer_page)

        if url.startswith("https://www.deezer"):
            return cls(SourcePages.DEEZER, url, referrer_page=referrer_page)

        if url.startswith("https://open.spotify.com"):
            return cls(SourcePages.SPOTIFY, url, referrer_page=referrer_page)

        if "bandcamp" in url:
            return cls(SourcePages.BANDCAMP, url, referrer_page=referrer_page)

        if "wikipedia" in parsed_url.netloc:
            return cls(SourcePages.WIKIPEDIA, url, referrer_page=referrer_page)

        if url.startswith("https://www.metal-archives.com/"):
            return cls(SourcePages.ENCYCLOPAEDIA_METALLUM, url, referrer_page=referrer_page)

        # the less important ones
        if url.startswith("https://www.facebook"):
            return cls(SourcePages.FACEBOOK, url, referrer_page=referrer_page)

        if url.startswith("https://www.instagram"):
            return cls(SourcePages.INSTAGRAM, url, referrer_page=referrer_page)

        if url.startswith("https://twitter"):
            return cls(SourcePages.TWITTER, url, referrer_page=referrer_page)

        if url.startswith("https://myspace.com"):
            return cls(SourcePages.MYSPACE, url, referrer_page=referrer_page)
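
    # Illustrative sketch only (the track id below is a placeholder):
    #
    #   source = Source.match_url(
    #       "https://open.spotify.com/track/<id>",
    #       referrer_page=SourcePages.SPOTIFY,
    #   )
    #   # source.page_enum is SourcePages.SPOTIFY; a url that matches none of
    #   # the checks above falls through and match_url returns None.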

    @property
    def hash_url(self) -> str:
        return hash_url(self.url)

    @property
    def indexing_values(self) -> List[str]:
        r = [hash_url(self.url)]
        if self.audio_url:
            r.append(hash_url(self.audio_url))
        return r

    def __repr__(self) -> str:
        return f"Src({self.page_enum.value}: {shorten_display_url(self.url)})"

    def __merge__(self, other: Source, **kwargs):
        if self.audio_url is None:
            self.audio_url = other.audio_url
        self.additional_data.update(other.additional_data)

    page_str = property(fget=lambda self: self.page_enum.value)
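
    # Minimal merge sketch; the bandcamp urls are placeholders:
    #
    #   a = Source(SourcePages.BANDCAMP, "https://<artist>.bandcamp.com/track/<x>")
    #   b = Source(SourcePages.BANDCAMP, "https://<artist>.bandcamp.com/track/<x>",
    #              audio_url="https://<cdn>/stream/<x>.mp3")
    #   a.__merge__(b)
    #   # a.audio_url is filled in from b; a.referrer_page was already set to
    #   # SourcePages.BANDCAMP by __post_init__.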


class SourceCollection:
    __change_version__ = generate_id()

    _indexed_sources: Dict[str, Source]
    _page_to_source_list: Dict[SourcePages, List[Source]]

    def __init__(self, data: Optional[Iterable[Source]] = None, **kwargs):
        self._page_to_source_list = defaultdict(list)
        self._indexed_sources = {}

        self.extend(data or [])

    def has_source_page(self, *source_pages: SourcePages) -> bool:
        return any(source_page in self._page_to_source_list for source_page in source_pages)

    def get_sources(self, *source_pages: SourcePages) -> Generator[Source, None, None]:
        if not source_pages:
            source_pages = self.source_pages

        for page in source_pages:
            yield from self._page_to_source_list[page]
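
    # Usage sketch (`collection` stands for any populated SourceCollection):
    #
    #   for source in collection.get_sources():                     # every page
    #       ...
    #   for source in collection.get_sources(SourcePages.YOUTUBE):  # one page only
    #       ...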

    def append(self, source: Source):
        if source is None:
            return

        # check whether any of the source's indexing values already points to a known source
        existing_source = None
        for key in source.indexing_values:
            if key in self._indexed_sources:
                existing_source = self._indexed_sources[key]
                break

        if existing_source is not None:
            existing_source.__merge__(source)
            source = existing_source
        else:
            self._page_to_source_list[source.page_enum].append(source)

        # (re)index the source under all of its indexing values
        changed = False
        for key in source.indexing_values:
            if key not in self._indexed_sources:
                changed = True
            self._indexed_sources[key] = source

        if changed:
            self.__change_version__ = generate_id()
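
    # Deduplication sketch (the video id is a placeholder): appending a source whose
    # hashed url is already indexed merges into the existing entry instead of adding one.
    #
    #   collection = SourceCollection()
    #   collection.append(Source(SourcePages.YOUTUBE, "https://www.youtube.com/watch?v=<id>"))
    #   collection.append(Source(SourcePages.YOUTUBE, "https://www.youtube.com/watch?v=<id>",
    #                            audio_url="https://<cdn>/<id>.m4a"))
    #   len(collection.url_list)  # -> 1; the second append only filled in audio_url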

    def extend(self, sources: Iterable[Source]):
        for source in sources:
            self.append(source)

    def __iter__(self):
        yield from self.get_sources()

    def __merge__(self, other: SourceCollection, **kwargs):
        self.extend(other)

    @property
    def source_pages(self) -> Iterable[SourcePages]:
        return sorted(self._page_to_source_list.keys(), key=lambda page: page.value)

    @property
    def hash_url_list(self) -> List[str]:
        return [hash_url(source.url) for source in self.get_sources()]

    @property
    def url_list(self) -> List[str]:
        return [source.url for source in self.get_sources()]

    @property
    def homepage_list(self) -> List[str]:
        return [page.homepage for page in self.source_pages]

    def indexing_values(self) -> Generator[Tuple[str, str], None, None]:
        for index in self._indexed_sources:
            yield "url", index