failed attempts

This commit is contained in:
Hellow 2023-04-23 12:08:39 +02:00
parent 12ef9eb3dd
commit bd3e042ae2
6 changed files with 54 additions and 67 deletions

View File

@ -53,7 +53,7 @@ def real_download():
if __name__ == "__main__": if __name__ == "__main__":
music_kraken.cli(genre="test", command_list=[ music_kraken.cli(genre="test", command_list=[
"#a Molchat Doma", # "https://musify.club/release/molchat-doma-etazhi-2018-1092949",
"0", "https://musify.club/release/ghost-bath-self-loather-2021-1554266",
"ok" "ok"
]) ])

View File

@ -36,9 +36,9 @@ class Connection:
self.ACCEPTED_RESPONSE_CODES = accepted_response_codes or {200} self.ACCEPTED_RESPONSE_CODES = accepted_response_codes or {200}
self.SEMANTIC_NOT_FOUND = semantic_not_found self.SEMANTIC_NOT_FOUND = semantic_not_found
self._session_map: Dict[str] = { self.session = requests.Session()
self.HOST.netloc: self.new_session() self.session.headers = self.get_header(**self.HEADER_VALUES)
} self.session.proxies = self.rotating_proxy.current_proxy
def base_url(self, url: ParseResult = None): def base_url(self, url: ParseResult = None):
if url is None: if url is None:
@ -46,31 +46,9 @@ class Connection:
return urlunsplit((url.scheme, url.netloc, "", "", "")) return urlunsplit((url.scheme, url.netloc, "", "", ""))
def _register_session(self, session: requests.Session, **header_values):
session.headers = self.get_header(**header_values)
self.rotating_proxy.register_session(session)
def new_session(
self,
url: ParseResult = None,
refer_from_origin: bool = True
) -> requests.Session:
header_values = self.HEADER_VALUES.copy()
if url is not None:
header_values["Host"] = url.netloc
if not refer_from_origin:
header_values["Referer"] = self.base_url(url=url)
session = requests.Session()
self._register_session(session=session, **header_values)
return session
def get_header(self, **header_values) -> Dict[str, str]: def get_header(self, **header_values) -> Dict[str, str]:
return { return {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0", "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Connection": "keep-alive", "Connection": "keep-alive",
"Host": self.HOST.netloc, "Host": self.HOST.netloc,
"Referer": self.base_url(), "Referer": self.base_url(),
@ -78,20 +56,21 @@ class Connection:
} }
def rotate(self): def rotate(self):
self.rotating_proxy.rotate() self.session.proxies = self.rotating_proxy.rotate()
def get_session_from_url(self, url: str, refer_from_origin: bool = True) -> requests.Session: def _update_headers(
parsed_url = urlparse(url) self,
headers: Optional[dict],
refer_from_origin: bool,
url: ParseResult
) -> Dict[str, str]:
if headers is None:
headers = dict()
if parsed_url.netloc in self._session_map: if not refer_from_origin:
print("saved session") headers["Referer"] = self.base_url(url=url)
return self._session_map[parsed_url.netloc]
self._session_map[parsed_url.netloc] = self.new_session( return headers
url=parsed_url,
refer_from_origin=refer_from_origin
)
return self._session_map[parsed_url.netloc]
def _request( def _request(
self, self,
@ -100,6 +79,8 @@ class Connection:
accepted_response_code: set, accepted_response_code: set,
url: str, url: str,
timeout: float, timeout: float,
headers: dict,
refer_from_origin: bool = True,
**kwargs **kwargs
) -> Optional[requests.Response]: ) -> Optional[requests.Response]:
if try_count >= self.TRIES: if try_count >= self.TRIES:
@ -108,9 +89,20 @@ class Connection:
if timeout is None: if timeout is None:
timeout = self.TIMEOUT timeout = self.TIMEOUT
parsed_url = urlparse(url)
print(url)
print(parsed_url)
headers = self._update_headers(
headers=headers,
refer_from_origin=refer_from_origin,
url=parsed_url
)
retry = False retry = False
try: try:
r = request(url=url, timeout=timeout, **kwargs) r: requests.Response = request(url=parsed_url.geturl(), timeout=timeout, headers=headers, **kwargs)
except requests.exceptions.Timeout: except requests.exceptions.Timeout:
self.LOGGER.warning(f"Request timed out at \"{url}\": ({try_count}-{self.TRIES})") self.LOGGER.warning(f"Request timed out at \"{url}\": ({try_count}-{self.TRIES})")
retry = True retry = True
@ -121,19 +113,19 @@ class Connection:
if not retry: if not retry:
if self.SEMANTIC_NOT_FOUND and r.status_code == 404: if self.SEMANTIC_NOT_FOUND and r.status_code == 404:
self.LOGGER.warning(f"Couldn't find url (404): {url}") self.LOGGER.warning(f"Couldn't find url (404): {url}")
print(r.headers)
print(r.request.headers)
return return
if r.status_code in accepted_response_code: if r.status_code in accepted_response_code:
return r return r
print(r.content)
print(r.headers)
if not retry: if not retry:
self.LOGGER.warning(f"{self.HOST.netloc} responded wit {r.status_code} " self.LOGGER.warning(f"{self.HOST.netloc} responded wit {r.status_code} "
f"at {url}. ({try_count}-{self.TRIES})") f"at {url}. ({try_count}-{self.TRIES})")
self.LOGGER.debug(r.content) self.LOGGER.debug(r.content)
self.rotate() self.rotate()
print(r.headers)
return self._request( return self._request(
request=request, request=request,
@ -151,16 +143,18 @@ class Connection:
stream: bool = False, stream: bool = False,
accepted_response_codes: set = None, accepted_response_codes: set = None,
timeout: float = None, timeout: float = None,
headers: dict = None,
**kwargs **kwargs
) -> Optional[requests.Response]: ) -> Optional[requests.Response]:
s = self.get_session_from_url(url, refer_from_origin)
r = self._request( r = self._request(
request=s.get, request=self.session.get,
try_count=0, try_count=0,
accepted_response_code=accepted_response_codes or self.ACCEPTED_RESPONSE_CODES, accepted_response_code=accepted_response_codes or self.ACCEPTED_RESPONSE_CODES,
url=url, url=url,
stream=stream, stream=stream,
timeout=timeout, timeout=timeout,
headers=headers,
refer_from_origin=refer_from_origin,
**kwargs **kwargs
) )
if r is None: if r is None:
@ -175,14 +169,17 @@ class Connection:
stream: bool = False, stream: bool = False,
accepted_response_codes: set = None, accepted_response_codes: set = None,
timeout: float = None, timeout: float = None,
headers: dict = None,
**kwargs **kwargs
) -> Optional[requests.Response]: ) -> Optional[requests.Response]:
r = self._request( r = self._request(
request=self.get_session_from_url(url, refer_from_origin).post, request=self.session.post,
try_count=0, try_count=0,
accepted_response_code=accepted_response_codes or self.ACCEPTED_RESPONSE_CODES, accepted_response_code=accepted_response_codes or self.ACCEPTED_RESPONSE_CODES,
url=url, url=url,
timeout=timeout, timeout=timeout,
headers=headers,
refer_from_origin=refer_from_origin,
json=json, json=json,
stream=stream, stream=stream,
**kwargs **kwargs

View File

@ -30,27 +30,14 @@ class RotatingObject:
class RotatingProxy(RotatingObject): class RotatingProxy(RotatingObject):
def __init__(self, proxy_list: List[Dict[str, str]], session_list: List[requests.Session] = None): def __init__(self, proxy_list: List[Dict[str, str]]):
self._session_list: List[requests.Session] = session_list super().__init__(
if self._session_list is None: proxy_list if len(proxy_list) > 0 else [None]
self._session_list = [] )
super().__init__(proxy_list if len(proxy_list) > 0 else [{}]) def rotate(self) -> Dict[str, str]:
return self.next
def register_session(self, session: requests.Session):
self._session_list.append(session)
session.proxies = self.current_proxy
def rotate(self):
new_proxy = self.next
for session in self._session_list:
session.proxies = new_proxy
@property @property
def current_proxy(self) -> Dict[str, str]: def current_proxy(self) -> Dict[str, str]:
return super().object return super().object
@property
def next(self) -> Dict[str, str]:
return super().object

View File

@ -6,6 +6,7 @@ from .multiple_options import MultiPageOptions
from ..abstract import Page from ..abstract import Page
from ..support_classes.download_result import DownloadResult from ..support_classes.download_result import DownloadResult
from ...objects import DatabaseObject, Source from ...objects import DatabaseObject, Source
from ...utils.enums.source import SourcePages
class Search(Download): class Search(Download):
@ -116,7 +117,7 @@ class Search(Download):
can download directly after can download directly after
""" """
source = Source.match_url(url=url) source = Source.match_url(url=url, referer_page=SourcePages.MANUAL)
if source is None: if source is None:
return False return False

View File

@ -1039,7 +1039,7 @@ class Musify(Page):
cls.LOGGER.warning(f"The source has no audio link. Falling back to {endpoint}.") cls.LOGGER.warning(f"The source has no audio link. Falling back to {endpoint}.")
r = cls.CONNECTION.get(endpoint, stream=True, allow_redirects=True, headers={"Host": "40s.musify.club", "Referer": endpoint}) r = cls.CONNECTION.get(endpoint, stream=True, allow_redirects=True, headers={"Connection": "https://musify.club/"})
if r is None: if r is None:
return DownloadResult(error_message=f"couldn't connect to {endpoint}") return DownloadResult(error_message=f"couldn't connect to {endpoint}")

View File

@ -25,6 +25,8 @@ class SourcePages(Enum):
TWITTER = "twitter" # I will use nitter though lol TWITTER = "twitter" # I will use nitter though lol
MYSPACE = "myspace" # Yes somehow this ancient site is linked EVERYWHERE MYSPACE = "myspace" # Yes somehow this ancient site is linked EVERYWHERE
MANUAL = "manual"
@classmethod @classmethod
def get_homepage(cls, attribute) -> str: def get_homepage(cls, attribute) -> str:
homepage_map = { homepage_map = {