music-kraken-core/music_kraken/objects/source.py

from __future__ import annotations

from collections import defaultdict
from enum import Enum
from typing import List, Dict, Set, Tuple, Optional, Iterable, Generator
from urllib.parse import urlparse, ParseResult
from dataclasses import dataclass, field
from functools import cached_property

from ..utils import generate_id
from ..utils.enums.source import SourcePages, SourceTypes
from ..utils.config import youtube_settings
from ..utils.string_processing import hash_url

from .metadata import Mapping, Metadata
from .parents import OuterProxy
from .collection import Collection


@dataclass
class Source:
    """
    One URL pointing at a resource on a specific source page.

    Attributes:
        url: the URL of the resource.
        page_enum: the page the URL belongs to.
        referrer_page: the page on which this URL was found; falls back to
            ``page_enum`` when it is falsy (see ``__post_init__``).
        audio_url: optional URL of the audio itself.
        id: identifier produced by ``generate_id`` unless given explicitly.
        additional_data: free-form dictionary for extra data.
    """
    url: str
    page_enum: SourcePages
    # Both default to None so that a Source can be built from just a url and a
    # page; previously `match_url` could not construct an instance at all.
    referrer_page: Optional[SourcePages] = None
    audio_url: Optional[str] = None

    id: int = field(default_factory=generate_id)
    additional_data: dict = field(default_factory=dict)

    def __post_init__(self):
        # a source without an explicit referrer is its own referrer
        self.referrer_page = self.referrer_page or self.page_enum

    @cached_property
    def parsed_url(self) -> ParseResult:
        """The result of ``urlparse(self.url)``, computed once per instance."""
        return urlparse(self.url)

    @classmethod
    def match_url(cls, url: str, referrer_page: SourcePages) -> Optional["Source"]:
        """
        Guess the source page from the url and build a matching Source.

        This shouldn't be used unless you are not certain what page the url
        belongs to, because it is less efficient than constructing the Source
        with a known page directly.

        :param url: the url to classify
        :param referrer_page: the page on which the url was found
        :return: a new Source, or None if the url matches no known page
        """
        parsed = urlparse(url)
        url = parsed.geturl()

        def build(page: SourcePages) -> "Source":
            # Keyword arguments are required here: the dataclass fields are
            # declared as (url, page_enum, ...), so passing (page, url)
            # positionally would swap the two values.
            return cls(url=url, page_enum=page, referrer_page=referrer_page)

        if "musify" in parsed.netloc:
            return build(SourcePages.MUSIFY)

        if any(parsed.netloc == _url.netloc for _url in youtube_settings['youtube_url']):
            return build(SourcePages.YOUTUBE)

        if url.startswith("https://www.deezer"):
            return build(SourcePages.DEEZER)

        if url.startswith("https://open.spotify.com"):
            return build(SourcePages.SPOTIFY)

        if "bandcamp" in url:
            return build(SourcePages.BANDCAMP)

        if "wikipedia" in parsed.netloc:
            return build(SourcePages.WIKIPEDIA)

        if url.startswith("https://www.metal-archives.com/"):
            return build(SourcePages.ENCYCLOPAEDIA_METALLUM)

        # the less important ones
        if url.startswith("https://www.facebook"):
            return build(SourcePages.FACEBOOK)

        if url.startswith("https://www.instagram"):
            return build(SourcePages.INSTAGRAM)

        if url.startswith("https://twitter"):
            return build(SourcePages.TWITTER)

        if url.startswith("https://myspace.com"):
            return build(SourcePages.MYSPACE)

        return None

    def get_song_metadata(self) -> Metadata:
        """Metadata for a song: the url of this source and the homepage of its page."""
        return Metadata({
            Mapping.FILE_WEBPAGE_URL: [self.url],
            Mapping.SOURCE_WEBPAGE_URL: [self.homepage]
        })

    def get_artist_metadata(self) -> Metadata:
        """Metadata for an artist: only the url of this source."""
        return Metadata({
            Mapping.ARTIST_WEBPAGE_URL: [self.url]
        })

    @property
    def hash_url(self) -> str:
        """The url passed through the ``hash_url`` string-processing helper."""
        return hash_url(self.url)

    @property
    def metadata(self) -> Metadata:
        """Alias for ``get_song_metadata()``."""
        return self.get_song_metadata()

    @property
    def indexing_values(self) -> List[Tuple[str, object]]:
        """``(name, value)`` pairs for the id, the url and the audio url."""
        return [
            ('id', self.id),
            ('url', self.url),
            ('audio_url', self.audio_url),
        ]

    def __str__(self):
        return repr(self)

    def __repr__(self) -> str:
        return f"Src({self.page_enum.value}: {self.url}, {self.audio_url})"

    @property
    def title_string(self) -> str:
        """The url, used as the title of this source."""
        return self.url

    page_str = property(fget=lambda self: self.page_enum.value)
    # NOTE(review): no `type_enum` attribute is defined on this class, so this
    # property looks like a leftover that raises AttributeError - confirm and remove.
    type_str = property(fget=lambda self: self.type_enum.value)
    homepage = property(fget=lambda self: SourcePages.get_homepage(self.page_enum))


class SourceCollection:
    """
    A collection of sources, grouped by the page they belong to.

    Sources are stored in insertion order in one list per page.
    """
    # maps every page to the sources which were added for it
    _page_to_source_list: Dict[SourcePages, List[Source]]

    def __init__(self, data: Optional[Iterable[Source]] = None, **kwargs):
        """
        :param data: optional sources to fill the collection with
        :param kwargs: accepted for compatibility and ignored
        """
        self._page_to_source_list = defaultdict(list)

        if data is not None:
            self.extend(data)

    def get_sources(self, *source_pages: SourcePages) -> Generator[Source, None, None]:
        """
        Yield the sources of every given page, in the order the pages were passed.

        Pages without any sources are skipped; passing no page yields nothing.
        """
        for page in source_pages:
            # .get() instead of indexing, so that a lookup doesn't create an
            # empty entry in the defaultdict
            yield from self._page_to_source_list.get(page, ())

    def append(self, source: Source):
        """Add a source to the list of its page (``source.page_enum``); None is ignored."""
        if source is None:
            return

        self._page_to_source_list[source.page_enum].append(source)

    def extend(self, sources: Iterable[Source]):
        """Append every source of the iterable."""
        for source in sources:
            self.append(source)

    @property
    def source_pages(self) -> Set[SourcePages]:
        """The set of pages for which at least one source is stored."""
        return {page for page, sources in self._page_to_source_list.items() if sources}

    def get_sources_from_page(self, source_page: SourcePages) -> List[Source]:
        """
        getting the sources for a specific page like
        YouTube or musify

        The returned list is a copy; modifying it doesn't change the collection.
        """
        return list(self._page_to_source_list.get(source_page, ()))
fix: raised the recursion limit in debug to 500 2024-04-16 11:23:20 +00:00			`from __future__ import annotations`

refactorings 2023-03-09 21:14:39 +00:00			`from collections import defaultdict`
dasfh 2023-01-12 15:25:50 +00:00			`from enum import Enum`
draft: string processing 2024-04-29 12:40:49 +00:00			`from typing import List, Dict, Set, Tuple, Optional, Iterable, Generator`
			`from urllib.parse import urlparse, ParseResult`
			`from dataclasses import dataclass, field`
			`from functools import cached_property`
dasfh 2023-01-12 15:25:50 +00:00
draft: string processing 2024-04-29 12:40:49 +00:00			`from ..utils import generate_id`
refactored source 2023-04-18 09:18:17 +00:00			`from ..utils.enums.source import SourcePages, SourceTypes`
started to migrate to new config 2023-09-10 14:27:09 +00:00			`from ..utils.config import youtube_settings`
feat: added base functionality of artwork class 2024-04-10 14:39:46 +00:00			`from ..utils.string_processing import hash_url`
started to migrate to new config 2023-09-10 14:27:09 +00:00
refactor 2023-03-10 08:09:35 +00:00			`from .metadata import Mapping, Metadata`
feat: fixed bugs 2023-12-19 21:11:46 +00:00			`from .parents import OuterProxy`
refactorings 2023-03-09 21:14:39 +00:00			`from .collection import Collection`
dasfh 2023-01-12 15:25:50 +00:00
added source to artist 2023-01-20 22:05:15 +00:00
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00
			`@dataclass`
draft: string processing 2024-04-29 12:40:49 +00:00			`class Source:`
feat: added annotations for init 2023-12-29 20:16:09 +00:00			`url: str`
feat: fixed bugs 2023-12-19 21:11:46 +00:00			`page_enum: SourcePages`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`referrer_page: SourcePages`
draft: string processing 2024-04-29 12:40:49 +00:00			`audio_url: Optional[str]`
feat: fixed bugs 2023-12-19 21:11:46 +00:00
draft: string processing 2024-04-29 12:40:49 +00:00			`id: int = field(default_factory=generate_id)`
			`additional_data: dict = field(default_factory=dict)`
feat: fixed bugs 2023-12-19 21:11:46 +00:00
draft: string processing 2024-04-29 12:40:49 +00:00			`def __post_init__(self):`
			`self.referrer_page = self.referrer_page or self.page_enum`

			`@cached_property`
			`def parsed_url(self) -> ParseResult:`
			`return urlparse(self.url)`
dasfh 2023-01-12 15:25:50 +00:00
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00			`@classmethod`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`def match_url(cls, url: str, referrer_page: SourcePages) -> Optional["Source"]:`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00			`"""`
			`this shouldn't be used, unlesse you are not certain what the source is for`
			`the reason is that it is more inefficient`
			`"""`
added fetching of linked sources to musify 2023-03-18 12:01:27 +00:00			`parsed = urlparse(url)`
			`url = parsed.geturl()`
made the direct downloads work 2023-03-30 10:31:37 +00:00
			`if "musify" in parsed.netloc:`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.MUSIFY, url, referrer_page=referrer_page)`
added fetching of linked sources to musify 2023-03-18 12:01:27 +00:00
started to migrate to new config 2023-09-10 14:27:09 +00:00			`if parsed.netloc in [_url.netloc for _url in youtube_settings['youtube_url']]:`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.YOUTUBE, url, referrer_page=referrer_page)`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00
			`if url.startswith("https://www.deezer"):`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.DEEZER, url, referrer_page=referrer_page)`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00
			`if url.startswith("https://open.spotify.com"):`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.SPOTIFY, url, referrer_page=referrer_page)`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00
			`if "bandcamp" in url:`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.BANDCAMP, url, referrer_page=referrer_page)`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00
added wikipedia 2023-03-18 16:06:12 +00:00			`if "wikipedia" in parsed.netloc:`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.WIKIPEDIA, url, referrer_page=referrer_page)`
added wikipedia 2023-03-18 16:06:12 +00:00
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00			`if url.startswith("https://www.metal-archives.com/"):`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.ENCYCLOPAEDIA_METALLUM, url, referrer_page=referrer_page)`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00
			`# the less important once`
			`if url.startswith("https://www.facebook"):`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.FACEBOOK, url, referrer_page=referrer_page)`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00
			`if url.startswith("https://www.instagram"):`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.INSTAGRAM, url, referrer_page=referrer_page)`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00
			`if url.startswith("https://twitter"):`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.TWITTER, url, referrer_page=referrer_page)`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00
added fetching of linked sources to musify 2023-03-18 12:01:27 +00:00			`if url.startswith("https://myspace.com"):`
feat: renamed referrer page fixing typo 2024-04-29 11:51:43 +00:00			`return cls(SourcePages.MYSPACE, url, referrer_page=referrer_page)`
added fetching of linked sources to musify 2023-03-18 12:01:27 +00:00
fixed exceptions 2023-03-10 09:13:35 +00:00			`def get_song_metadata(self) -> Metadata:`
			`return Metadata({`
finished refactoring metadata 2023-01-30 13:41:02 +00:00			`Mapping.FILE_WEBPAGE_URL: [self.url],`
			`Mapping.SOURCE_WEBPAGE_URL: [self.homepage]`
			`})`

fixed exceptions 2023-03-10 09:13:35 +00:00			`def get_artist_metadata(self) -> Metadata:`
			`return Metadata({`
finished refactoring metadata 2023-01-30 13:41:02 +00:00			`Mapping.ARTIST_WEBPAGE_URL: [self.url]`
			`})`

feat: massive improvements to the fetch and download order 2024-04-10 09:20:49 +00:00			`@property`
			`def hash_url(self) -> str:`
feat: added base functionality of artwork class 2024-04-10 14:39:46 +00:00			`return hash_url(self.url)`
feat: massive improvements to the fetch and download order 2024-04-10 09:20:49 +00:00
refactor 2023-03-10 08:09:35 +00:00			`@property`
			`def metadata(self) -> Metadata:`
fixed crash while tagging that arouse from refactor 2023-04-18 13:31:41 +00:00			`return self.get_song_metadata()`
dfsa 2023-01-12 16:14:21 +00:00
implemented DatabaseObject.indexing_values for each data objects 2023-03-09 18:53:28 +00:00			`@property`
			`def indexing_values(self) -> List[Tuple[str, object]]:`
			`return [`
			`('id', self.id),`
Fixed bug with inconsistent dynamic creation of direct download links 2023-04-18 13:24:39 +00:00			`('url', self.url),`
			`('audio_url', self.audio_url),`
implemented DatabaseObject.indexing_values for each data objects 2023-03-09 18:53:28 +00:00			`]`
fix: recursion depth error 2024-04-18 13:30:04 +00:00
dasfh 2023-01-12 15:25:50 +00:00			`def __str__(self):`
added the fetching of the sources from ma regarding artists 2023-02-01 08:10:05 +00:00			`return self.__repr__()`
dasfh 2023-01-12 15:25:50 +00:00
finished refactoring metadata 2023-01-30 13:41:02 +00:00			`def __repr__(self) -> str:`
Fixed bug with inconsistent dynamic creation of direct download links 2023-04-18 13:24:39 +00:00			`return f"Src({self.page_enum.value}: {self.url}, {self.audio_url})"`
finished refactoring metadata 2023-01-30 13:41:02 +00:00
fix: raised the recursion limit in debug to 500 2024-04-16 11:23:20 +00:00			`@property`
			`def title_string(self) -> str:`
			`return self.url`

added source to artist 2023-01-20 22:05:15 +00:00			`page_str = property(fget=lambda self: self.page_enum.value)`
added type to src 2023-01-20 09:56:40 +00:00			`type_str = property(fget=lambda self: self.type_enum.value)`
added source to artist 2023-01-20 22:05:15 +00:00			`homepage = property(fget=lambda self: SourcePages.get_homepage(self.page_enum))`
source 2023-01-25 13:14:15 +00:00

draft: string processing 2024-04-29 12:40:49 +00:00			`class SourceCollection:`
			`_page_to_source_list: Dict[SourcePages, List[Source]]`


fixed the mapping in source maps 2023-12-29 14:43:33 +00:00			`def __init__(self, data: Optional[Iterable[Source]] = None, **kwargs):`
draft: string processing 2024-04-29 12:40:49 +00:00			`self._page_to_source_list = defaultdict(list)`
refactorings 2023-03-09 21:14:39 +00:00
draft: string processing 2024-04-29 12:40:49 +00:00			`def get_sources(self, *source_pages: List[Source]) -> Generator[Source]:`
			`for page in source_pages:`
			`yield from self._page_to_source_list[page]`
fixed exceptions 2023-03-10 09:13:35 +00:00
draft: string processing 2024-04-29 12:40:49 +00:00			`def append(self, source: Source):`
			`pass`
refactorings 2023-03-09 21:14:39 +00:00
draft: string processing 2024-04-29 12:40:49 +00:00			`def extend(self, sources: Iterable[Source]):`
			`for source in sources:`
			`self.append(source)`
added wrapper methods, for the basic fuctionalities of the webscraper 2023-05-26 09:41:20 +00:00
			`@property`
			`def source_pages(self) -> Set[SourcePages]:`
			`return set(source.page_enum for source in self._data)`
refactorings 2023-03-09 21:14:39 +00:00
			`def get_sources_from_page(self, source_page: SourcePages) -> List[Source]:`
			`"""`
			`getting the sources for a specific page like`
			`YouTube or musify`
			`"""`
added fetching of linked sources to musify 2023-03-18 12:01:27 +00:00			`return self._page_to_source_list[source_page].copy()`