
373 lines
13 KiB
Raw Normal View History

2024-02-28 13:27:35 +00:00
from __future__ import annotations
import logging
import threading
import time
from typing import List, Dict, Optional, Set
2023-04-22 12:20:19 +00:00
from urllib.parse import urlparse, urlunsplit, ParseResult
import copy
import inspect
import requests
import responses
2023-06-15 16:22:00 +00:00
from tqdm import tqdm
from .cache import Cache
from .rotating import RotatingProxy
from ..objects import Target
from ..utils import request_trace
from ..utils.string_processing import shorten_display_url
2023-09-10 14:27:09 +00:00
from ..utils.config import main_settings
2023-10-23 14:21:44 +00:00
from ..utils.support_classes.download_result import DownloadResult
2024-02-28 13:27:35 +00:00
from ..utils.hacking import merge_args
# HTTP helper built around one shared `requests.Session`, a `RotatingProxy`
# and a `Cache`; it also carries the state for an optional heartbeat thread.
class Connection:
# Store the configuration and create the cache, the proxy rotation and the session.
# NOTE(review): this listing looks incomplete — no `self` parameter, no closing
# `):` and no `heartbeat_interval` parameter are visible, although the body
# below reads all of them. Confirm against the real file before editing.
def __init__(
2024-04-10 16:18:52 +00:00
host: str = None,
proxies: List[dict] = None,
# default is computed once, when this `def` is executed:
# (number of configured proxies + 1) * tries_per_proxy
tries: int = (len(main_settings["proxies"]) + 1) * main_settings["tries_per_proxy"],
timeout: int = 7,
logger: logging.Logger = logging.getLogger("connection"),
header_values: Dict[str, str] = None,
accepted_response_codes: Set[int] = None,
semantic_not_found: bool = True,
sleep_after_404: float = 0.0,
module: str = "general",
cache_expiring_duration: float = 10
# without an explicit proxy list, fall back to the one from the global settings
if proxies is None:
2023-09-10 14:27:09 +00:00
proxies = main_settings["proxies"]
self.cache: Cache = Cache(module=module, logger=logger)
# default `expires_in` used by save() when the caller passes none
self.cache_expiring_duration = cache_expiring_duration
2024-02-28 13:52:02 +00:00
self.HEADER_VALUES = dict() if header_values is None else header_values
2023-04-22 12:20:19 +00:00
self.LOGGER = logger
2024-04-10 16:18:52 +00:00
# HOST stays None when no host was given, otherwise it holds the parsed URL
self.HOST = host if host is None else urlparse(host)
self.TRIES = tries
self.TIMEOUT = timeout
self.rotating_proxy = RotatingProxy(proxy_list=proxies)
# a missing (or empty) set falls back to accepting only status 200
self.ACCEPTED_RESPONSE_CODES = accepted_response_codes or {200}
self.SEMANTIC_NOT_FOUND = semantic_not_found
self.sleep_after_404 = sleep_after_404
2023-04-23 10:08:39 +00:00
# one session is shared by all requests; it starts with the default headers
# and the proxy currently selected by the rotation
self.session = requests.Session()
self.session.headers = self.get_header(**self.HEADER_VALUES)
self.session.proxies = self.rotating_proxy.current_proxy
2023-09-12 06:45:48 +00:00
# the thread object is only created in start_heartbeat()
self.heartbeat_thread = None
self.heartbeat_interval = heartbeat_interval
# plain boolean flag; request() sets it before the HTTP call and clears it afterwards
self.lock: bool = False
2023-09-12 06:45:48 +00:00
def start_heartbeat(self):
    """Create and start the daemon thread that runs the heartbeat loop.

    When ``self.heartbeat_interval`` is not positive, a warning is logged and
    no thread is created: a loop with a zero or negative interval cannot pace
    itself.

    The thread targets ``self._heartbeat_loop`` and receives the interval as
    its only argument. It is a daemon thread, so it never keeps the process
    alive on its own.
    """
    if self.heartbeat_interval <= 0:
        self.LOGGER.warning(f"Can't start a heartbeat with {self.heartbeat_interval}s in between.")
        # bail out instead of spawning a thread with an unusable interval
        return

    self.heartbeat_thread = threading.Thread(
        target=self._heartbeat_loop,
        args=(self.heartbeat_interval,),
        daemon=True,
    )
    # creating the Thread object alone does not run it
    self.heartbeat_thread.start()
2023-09-12 06:45:48 +00:00
2023-09-12 06:45:48 +00:00
def heartbeat_failed(self):
    """Log a warning stating that the heartbeat did not succeed."""
    message = "The hearth couldn't beat."
    self.LOGGER.warning(message)
2023-09-12 06:45:48 +00:00
def heartbeat(self):
    """Send one heartbeat request.

    This implementation is only a placeholder and always raises.

    Raises:
        NotImplementedError: always; the heartbeat request itself has to be
            supplied by an overriding implementation.
    """
    # Your code to send heartbeat requests goes here
    reason = "please implement the heartbeat function."
    raise NotImplementedError(reason)
2023-09-12 06:45:48 +00:00
# Target of the thread created in start_heartbeat(); `interval` is the value of
# `self.heartbeat_interval` that was passed as the thread argument.
# NOTE(review): only fragments of this method are visible — the rest of
# `heartbeat_wrapper` and the whole body of the `while True` loop are missing
# from this listing. Confirm against the real file.
def _heartbeat_loop(self, interval: float):
def heartbeat_wrapper():
self.LOGGER.debug(f"The hearth is beating.")
2023-09-12 06:45:48 +00:00
# endless loop; its body is not part of this listing
while True:
2023-09-12 06:45:48 +00:00
2023-04-22 12:20:19 +00:00
def base_url(self, url: ParseResult = None):
    """Return only the ``scheme://netloc`` part of a parsed URL.

    Args:
        url: the parsed URL to reduce; when it is None, ``self.HOST`` is used
            instead.

    Returns:
        The URL rebuilt from scheme and netloc with empty path, query and
        fragment.
    """
    source = url if url is not None else self.HOST
    parts = (source.scheme, source.netloc, "", "", "")
    return urlunsplit(parts)
def get_header(self, **header_values) -> Dict[str, str]:
    """Build the default request headers, extended by caller-supplied values.

    The defaults are the user agent (under both ``user-agent`` and
    ``User-Agent``), ``Connection: keep-alive`` and the ``Accept-Language``
    taken from the global settings. When ``self.HOST`` is set, ``Referer`` is
    set to the base URL of that host.

    Args:
        **header_values: additional headers; they are applied last and
            therefore override a default with the same key.

    Returns:
        A new dict holding the merged headers.
    """
    headers = {
        "user-agent": main_settings["user_agent"],
        "User-Agent": main_settings["user_agent"],
        "Connection": "keep-alive",
        "Accept-Language": main_settings["language"],
    }

    if self.HOST is not None:
        # headers["Host"] = self.HOST.netloc
        headers["Referer"] = self.base_url(url=self.HOST)

    # both the constructor and _update_headers() pass their values in here
    # and rely on them ending up in the result
    headers.update(header_values)

    return headers
def rotate(self):
    """Rotate the proxy and make the session use the result.

    Whatever ``self.rotating_proxy.rotate()`` returns becomes the new
    ``self.session.proxies``.
    """
    rotated = self.rotating_proxy.rotate()
    self.session.proxies = rotated
2023-04-23 10:08:39 +00:00
def _update_headers(
headers: Optional[dict],
refer_from_origin: bool,
url: ParseResult
) -> Dict[str, str]:
2024-01-22 20:39:39 +00:00
headers = self.get_header(**(headers or {}))
2023-04-23 10:08:39 +00:00
if not refer_from_origin:
2024-04-11 18:13:12 +00:00
headers["Referer"] = self.base_url(url=url)
2023-04-22 12:20:19 +00:00
2023-04-23 10:08:39 +00:00
return headers
2023-04-22 12:20:19 +00:00
2024-04-26 11:50:17 +00:00
def save(self, r: requests.Response, name: str, error: bool = False, no_update_if_valid_exists: bool = False, **kwargs):
    """Store the content of a response in the cache under ``name``.

    Args:
        r: the response; its ``content`` is stored and its ``encoding`` is
            recorded in the entry's additional info.
        name: cache key of the entry.
        error: when True the entry is written with ``module="failed_requests"``.
        no_update_if_valid_exists: when True, an entry that the cache already
            returns for ``name`` is left untouched.
        **kwargs: only ``expires_in`` is read; it defaults to
            ``self.cache_expiring_duration``.
    """
    n_kwargs = {}
    if error:
        n_kwargs["module"] = "failed_requests"

    if self.cache.get(name) is not None and no_update_if_valid_exists:
        # the caller asked to keep an existing entry, so nothing is written
        return

    self.cache.set(
        r.content,
        name,
        expires_in=kwargs.get("expires_in", self.cache_expiring_duration),
        additional_info={
            "encoding": r.encoding,
        },
        **n_kwargs,
    )
# Send one HTTP request through the shared session, with an optional cache
# lookup in front of it and a retry (by calling itself again) behind it.
# Returns the response, or None when a 404 is treated as final.
# NOTE(review): this listing is missing lines — the signature shows neither
# `self` nor `**kwargs` although both are used, there is no `try:` in front of
# the `except` clauses, and several calls are cut off. The comments below only
# describe what is visible; confirm against the real file.
def request(
url: str,
timeout: float = None,
headers: Optional[dict] = None,
try_count: int = 0,
accepted_response_codes: set = None,
2023-04-23 10:08:39 +00:00
refer_from_origin: bool = True,
2023-05-03 13:16:01 +00:00
raw_url: bool = False,
2024-04-08 16:10:42 +00:00
raw_headers: bool = False,
sleep_after_404: float = None,
2023-09-12 06:45:48 +00:00
is_heartbeat: bool = False,
disable_cache: bool = None,
2024-04-26 11:50:17 +00:00
enable_cache_readonly: bool = False,
method: str = None,
name: str = "",
2024-04-11 18:13:12 +00:00
exclude_headers: List[str] = None,
) -> Optional[requests.Response]:
# `method` defaults to None but is mandatory
if method is None:
raise AttributeError("method is not set.")
method = method.upper()
2024-02-28 13:27:35 +00:00
headers = dict() if headers is None else headers
# caching is off when the caller says so, or — if the caller did not decide —
# when the request carries `Cache-Control: no-cache`; streamed requests are never cached
disable_cache = (headers.get("Cache-Control", "").lower() == "no-cache" if disable_cache is None else disable_cache) or kwargs.get("stream", False)
accepted_response_codes = self.ACCEPTED_RESPONSE_CODES if accepted_response_codes is None else accepted_response_codes
# snapshot of the local names at this point; it is reused as the argument set
# of the retry call at the end of the method
current_kwargs = copy.copy(locals())
2024-02-28 13:52:02 +00:00
2024-01-22 20:39:39 +00:00
parsed_url = urlparse(url)
trace_string = f"{method} {shorten_display_url(url)} \t{'[stream]' if kwargs.get('stream', False) else ''}"
2024-04-08 16:10:42 +00:00
# unless raw headers were requested, the defaults are merged in
# NOTE(review): the arguments of the `_update_headers(` call are not visible here
if not raw_headers:
_headers = copy.copy(self.HEADER_VALUES)
headers = self._update_headers(
headers = headers or {}
2024-02-28 13:52:02 +00:00
2024-04-08 16:10:42 +00:00
request_url = parsed_url.geturl() if not raw_url else url
2024-01-22 20:39:39 +00:00
2024-04-26 11:50:17 +00:00
# a cache lookup needs a name, and the cache must be enabled or at least readable
if name != "" and (not disable_cache or enable_cache_readonly):
cached = self.cache.get(name)
2024-01-22 20:39:39 +00:00
if cached is not None:
2024-01-22 20:39:39 +00:00
# the cached body is served through a mocked `requests` call
# NOTE(review): the registration of the mocked response is not visible here
with responses.RequestsMock() as resp:
2024-04-26 12:24:14 +00:00
additional_info = cached.attribute.additional_info
2024-04-26 12:04:44 +00:00
body = cached.content
# decode with the encoding that save() recorded, if there is one
if additional_info.get("encoding", None) is not None:
2024-04-26 12:24:14 +00:00
body = body.decode(additional_info["encoding"])
2024-04-26 12:04:44 +00:00
2024-01-22 20:39:39 +00:00
2024-04-08 16:10:42 +00:00
2024-04-26 12:04:44 +00:00
2024-01-22 20:39:39 +00:00
return requests.request(method=method, url=url, timeout=timeout, headers=headers, **kwargs)
# per-call values fall back to the instance defaults
if sleep_after_404 is None:
sleep_after_404 = self.sleep_after_404
# NOTE(review): the body of this retry-limit check is not visible here
if try_count >= self.TRIES:
if timeout is None:
timeout = self.TIMEOUT
2024-04-11 18:13:12 +00:00
# drop the headers the caller explicitly does not want to send
for header in exclude_headers or []:
if header in headers:
del headers[header]
# NOTE(review): the body of this first-attempt branch is not visible here
if try_count <= 0:
2024-01-22 20:39:39 +00:00
r = None
connection_failed = False
# `self.lock` is a plain bool; it is set before the HTTP call and cleared after it
if self.lock:
2023-09-12 06:45:48 +00:00"Waiting for the heartbeat to finish.")
# NOTE(review): the body of this wait loop is not visible here
while self.lock and not is_heartbeat:
self.lock = True
2024-04-24 10:12:09 +00:00
r: requests.Response = self.session.request(method=method, url=url, timeout=timeout, headers=headers, **kwargs)
2023-06-16 21:27:08 +00:00
# success path: an accepted status code returns the response right away
if r.status_code in accepted_response_codes:
2024-01-22 20:39:39 +00:00
if not disable_cache:, name, **kwargs)
return r
# the server rejected the request, or the internet is lacking
except requests.exceptions.Timeout:
2023-05-03 13:16:01 +00:00
self.LOGGER.warning(f"Request timed out at \"{request_url}\": ({try_count}-{self.TRIES})")
connection_failed = True
except requests.exceptions.ConnectionError:
2023-05-03 13:16:01 +00:00
self.LOGGER.warning(f"Couldn't connect to \"{request_url}\": ({try_count}-{self.TRIES})")
connection_failed = True
# this is important for thread safety
self.lock = False
2024-04-16 12:19:07 +00:00
# from here on the attempt has failed: log either the missing response or the
# rejected one
if r is None:
self.LOGGER.warning(f"{parsed_url.netloc} didn't respond at {url}. ({try_count}-{self.TRIES})")
2024-04-16 12:19:07 +00:00
self.LOGGER.debug("request headers:\n\t"+ "\n\t".join(f"{k}\t=\t{v}" for k, v in headers.items()))
self.LOGGER.warning(f"{parsed_url.netloc} responded wit {r.status_code} at {url}. ({try_count}-{self.TRIES})")
2024-04-16 12:19:07 +00:00
self.LOGGER.debug("request headers:\n\t"+ "\n\t".join(f"{k}\t=\t{v}" for k, v in r.request.headers.items()))
self.LOGGER.debug("response headers:\n\t"+ "\n\t".join(f"{k}\t=\t{v}" for k, v in r.headers.items()))
if name != "":, name, error=True, **kwargs)
# with SEMANTIC_NOT_FOUND a 404 counts as a final answer and is not retried
if self.SEMANTIC_NOT_FOUND and r.status_code == 404:
return None
2024-01-22 20:39:39 +00:00
if sleep_after_404 != 0:
self.LOGGER.warning(f"Waiting for {sleep_after_404} seconds.")
# retry: call request() again with the stored arguments and an incremented try counter
current_kwargs["try_count"] = current_kwargs.get("try_count", 0) + 1
2024-02-28 13:27:35 +00:00
return Connection.request(**current_kwargs)
# Convenience wrapper that forwards to request(); presumably it fixes the
# method to GET.
# NOTE(review): the argument list of the `self.request(` call is cut off in
# this listing — confirm against the real file.
def get(self, *args, **kwargs) -> Optional[requests.Response]:
return self.request(
# Convenience wrapper that forwards to request() and returns its result;
# presumably it fixes the method to POST and sends `json` as the payload.
# NOTE(review): parts of the signature and the argument list of the
# `self.request(` call are cut off in this listing — confirm against the real file.
def post(
json: dict = None,
) -> Optional[requests.Response]:
r = self.request(
# request() returned no response: log the payload that was being sent
if r is None:
self.LOGGER.warning(f"payload: {json}")
return r
2023-06-15 16:22:00 +00:00
2023-06-15 16:22:00 +00:00
# Stream the response body of `url` chunk by chunk into `target`, showing a
# tqdm progress bar, and call itself again when the stream broke or ended short.
# Returns a DownloadResult; it carries an error message when no stream could be
# established or when too many retries were needed.
# NOTE(review): this listing is missing lines — the signature shows neither
# `self` nor `**kwargs` although both are used, the `self.request(` call and the
# opening of the target file are cut off, and there is no `try:` in front of
# the `except`. The comments below only describe what is visible.
def stream_into(
url: str,
target: Target,
name: str = "download",
2023-09-10 14:27:09 +00:00
# default is read from the settings once, when this `def` is executed
chunk_size: int = main_settings["chunk_size"],
2023-06-16 10:26:02 +00:00
progress: int = 0,
method: str = "GET",
2024-04-08 16:10:42 +00:00
try_count: int = 0,
accepted_response_codes: set = None,
2023-06-15 16:22:00 +00:00
2023-06-16 10:26:02 +00:00
) -> DownloadResult:
2024-04-08 16:10:42 +00:00
accepted_response_codes = self.ACCEPTED_RESPONSE_CODES if accepted_response_codes is None else accepted_response_codes
# snapshot of the local names at this point; reused as the argument set of the retry call
stream_kwargs = copy.copy(locals())
2024-04-08 16:10:42 +00:00
# `description` is accepted as an alternative spelling of `name`
if "description" in kwargs:
name = kwargs.pop("description")
2023-06-16 21:27:08 +00:00
# on a resumed download, ask only for the bytes after what `target` already holds
# NOTE(review): when `kwargs` has no "headers" entry, the fresh dict is not
# written back to `kwargs` in the visible code — verify the Range header is sent
if progress > 0:
2024-05-15 15:14:01 +00:00
headers = kwargs.get("headers", dict())
2023-06-16 21:27:08 +00:00
headers["Range"] = f"bytes={target.size}-"
r = self.request(
2023-06-15 16:22:00 +00:00
2024-02-28 13:27:35 +00:00
2024-04-08 16:10:42 +00:00
2023-06-15 16:22:00 +00:00
if r is None:
return DownloadResult(error_message=f"Could not establish a stream from: {url}")
2023-06-15 16:22:00 +00:00
2024-04-11 18:13:12 +00:00
# without a content-length header, `chunk_size` is used as the expected total
total_size = int(r.headers.get('content-length', r.headers.get('Content-Length', chunk_size)))
2023-06-16 10:26:02 +00:00
# from here on `progress` counts the bytes written during this call
progress = 0
retry = False
2023-06-15 16:22:00 +00:00
2023-06-16 21:27:08 +00:00
with"ab") as f:
> The internationally recommended unit symbol for the kilobyte is kB.
2024-04-09 08:32:17 +00:00
with tqdm(total=total_size, initial=target.size, unit='B', unit_scale=True, unit_divisor=1024, desc=name) as t:
2023-06-16 21:27:08 +00:00
2023-06-16 10:26:02 +00:00
for chunk in r.iter_content(chunk_size=chunk_size):
2023-06-15 16:22:00 +00:00
size = f.write(chunk)
2023-06-16 10:26:02 +00:00
progress += size
2023-06-15 16:22:00 +00:00
2024-04-08 16:10:42 +00:00
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, requests.exceptions.ChunkedEncodingError):
2023-06-16 21:27:08 +00:00
# give up once the retry budget is used, otherwise mark the stream for a retry
if try_count >= self.TRIES:
self.LOGGER.warning(f"Stream timed out at \"{url}\": to many retries, aborting.")
return DownloadResult(error_message=f"Stream timed out from {url}, reducing the chunk_size might help.")
2023-06-16 21:27:08 +00:00
self.LOGGER.warning(f"Stream timed out at \"{url}\": ({try_count}-{self.TRIES})")
retry = True
2024-04-08 16:10:42 +00:00
# NOTE(review): only the local `try_count` is incremented; the visible code never
# copies it into `stream_kwargs`, so the retry call may start with the old count
try_count += 1
2023-06-16 21:27:08 +00:00
# fewer bytes than announced also triggers a retry
if total_size > progress:
2023-06-16 10:26:02 +00:00
retry = True
2023-06-16 21:27:08 +00:00
if retry:
self.LOGGER.warning(f"Retrying stream...")
# a positive `progress` makes the next call send a Range header
stream_kwargs["progress"] = progress
2024-02-28 13:52:02 +00:00
return Connection.stream_into(**stream_kwargs)
2023-06-16 21:27:08 +00:00
return DownloadResult()