music-kraken-core/src/music_kraken/connection/connection.py

194 lines
6.0 KiB
Python
Raw Normal View History

from typing import List, Dict, Callable, Optional, Set
2023-04-22 12:20:19 +00:00
from urllib.parse import urlparse, urlunsplit, ParseResult
import logging
import requests
from .rotating import RotatingProxy
from ..utils.shared import PROXIES_LIST
class Connection:
def __init__(
self,
host: str,
proxies: List[dict] = None,
tries: int = (len(PROXIES_LIST) + 1) * 2,
timeout: int = 7,
logger: logging.Logger = logging.getLogger("connection"),
header_values: Dict[str, str] = None,
accepted_response_codes: Set[int] = None,
semantic_not_found: bool = True
):
if proxies is None:
proxies = PROXIES_LIST
if header_values is None:
header_values = dict()
2023-04-22 12:20:19 +00:00
self.HEADER_VALUES = header_values
self.LOGGER = logger
self.HOST = urlparse(host)
self.TRIES = tries
self.TIMEOUT = timeout
self.rotating_proxy = RotatingProxy(proxy_list=proxies)
self.ACCEPTED_RESPONSE_CODES = accepted_response_codes or {200}
self.SEMANTIC_NOT_FOUND = semantic_not_found
2023-04-22 12:20:19 +00:00
self._session_map: Dict[str] = {
self.HOST.netloc: self.new_session()
}
2023-04-22 12:20:19 +00:00
def base_url(self, url: ParseResult = None):
if url is None:
url = self.HOST
2023-04-22 12:20:19 +00:00
return urlunsplit((url.scheme, url.netloc, "", "", ""))
def _register_session(self, session: requests.Session, **header_values):
session.headers = self.get_header(**header_values)
self.rotating_proxy.register_session(session)
2023-04-22 12:20:19 +00:00
def new_session(
self,
url: ParseResult = None,
refer_from_origin: bool = True
) -> requests.Session:
header_values = self.HEADER_VALUES.copy()
if url is not None:
header_values["Host"] = url.netloc
if not refer_from_origin:
header_values["Referer"] = self.base_url(url=url)
session = requests.Session()
self._register_session(session=session, **header_values)
return session
def get_header(self, **header_values) -> Dict[str, str]:
return {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0",
"Connection": "keep-alive",
"Host": self.HOST.netloc,
2023-04-22 12:20:19 +00:00
"Referer": self.base_url(),
**header_values
}
def rotate(self):
self.rotating_proxy.rotate()
2023-04-22 12:20:19 +00:00
def get_session_from_url(self, url: str, refer_from_origin: bool = True) -> requests.Session:
parsed_url = urlparse(url)
if parsed_url.netloc in self._session_map:
print("saved session")
return self._session_map[parsed_url.netloc]
self._session_map[parsed_url.netloc] = self.new_session(
url=parsed_url,
refer_from_origin=refer_from_origin
)
return self._session_map[parsed_url.netloc]
def _request(
self,
request: Callable,
try_count: int,
accepted_response_code: set,
url: str,
timeout: float,
**kwargs
) -> Optional[requests.Response]:
if try_count >= self.TRIES:
return
if timeout is None:
timeout = self.TIMEOUT
retry = False
try:
r = request(url=url, timeout=timeout, **kwargs)
except requests.exceptions.Timeout:
self.LOGGER.warning(f"Request timed out at \"{url}\": ({try_count}-{self.TRIES})")
retry = True
except requests.exceptions.ConnectionError:
self.LOGGER.warning(f"Couldn't connect to \"{url}\": ({try_count}-{self.TRIES})")
retry = True
if not retry:
if self.SEMANTIC_NOT_FOUND and r.status_code == 404:
self.LOGGER.warning(f"Couldn't find url (404): {url}")
return
if r.status_code in accepted_response_code:
return r
2023-04-22 12:20:19 +00:00
print(r.content)
print(r.headers)
if not retry:
self.LOGGER.warning(f"{self.HOST.netloc} responded wit {r.status_code} "
f"at {url}. ({try_count}-{self.TRIES})")
self.LOGGER.debug(r.content)
self.rotate()
return self._request(
request=request,
try_count=try_count,
accepted_response_code=accepted_response_code,
url=url,
2023-04-22 12:20:19 +00:00
timeout=timeout,
**kwargs
)
def get(
self,
url: str,
2023-04-22 12:20:19 +00:00
refer_from_origin: bool = True,
stream: bool = False,
accepted_response_codes: set = None,
timeout: float = None,
**kwargs
) -> Optional[requests.Response]:
2023-04-22 12:20:19 +00:00
s = self.get_session_from_url(url, refer_from_origin)
r = self._request(
2023-04-22 12:20:19 +00:00
request=s.get,
try_count=0,
accepted_response_code=accepted_response_codes or self.ACCEPTED_RESPONSE_CODES,
url=url,
stream=stream,
timeout=timeout,
**kwargs
)
if r is None:
self.LOGGER.warning(f"Max attempts ({self.TRIES}) exceeded for: GET:{url}")
return r
def post(
self,
url: str,
json: dict,
2023-04-22 12:20:19 +00:00
refer_from_origin: bool = True,
stream: bool = False,
accepted_response_codes: set = None,
timeout: float = None,
**kwargs
) -> Optional[requests.Response]:
r = self._request(
2023-04-22 12:20:19 +00:00
request=self.get_session_from_url(url, refer_from_origin).post,
try_count=0,
accepted_response_code=accepted_response_codes or self.ACCEPTED_RESPONSE_CODES,
url=url,
timeout=timeout,
json=json,
stream=stream,
**kwargs
)
if r is None:
self.LOGGER.warning(f"Max attempts ({self.TRIES}) exceeded for: GET:{url}")
self.LOGGER.warning(f"payload: {json}")
return r