2023-06-13 18:10:11 +00:00
import time
2023-04-20 16:47:47 +00:00
from typing import List , Dict , Callable , Optional , Set
2023-04-22 12:20:19 +00:00
from urllib . parse import urlparse , urlunsplit , ParseResult
2023-04-20 16:47:47 +00:00
import logging
2023-07-17 19:19:19 +00:00
import threading
2023-04-20 16:47:47 +00:00
import requests
2023-06-15 16:22:00 +00:00
from tqdm import tqdm
2023-04-20 16:47:47 +00:00
from . rotating import RotatingProxy
2023-09-10 14:27:09 +00:00
from . . utils . config import main_settings
2023-06-16 10:26:02 +00:00
from . . utils . support_classes import DownloadResult
2023-06-15 16:22:00 +00:00
from . . objects import Target
2023-04-20 16:47:47 +00:00
class Connection:
    """
    Wrapper around a single ``requests.Session`` that adds:

    * proxy rotation via :class:`RotatingProxy` on every failed attempt,
    * bounded retrying (``tries``) with recursive re-dispatch,
    * automatic ``Referer``/user-agent header management,
    * optional "hearthbeat" keep-alive requests on a background thread
      (note: "hearthbeat" spelling is kept — it is the established
      public interface of this class),
    * resumable streaming downloads into a :class:`Target`.
    """

    def __init__(
            self,
            host: str,
            proxies: List[dict] = None,
            tries: int = (len(main_settings["proxies"]) + 1) * 4,
            timeout: int = 7,
            logger: logging.Logger = logging.getLogger("connection"),
            header_values: Dict[str, str] = None,
            accepted_response_codes: Set[int] = None,
            semantic_not_found: bool = True,
            sleep_after_404: float = 0.0,
            hearthbeat_interval: float = 0,
    ):
        """
        :param host: base URL of the remote service; parsed once and used
            for the default ``Referer`` header.
        :param proxies: proxy dicts for the rotating pool; defaults to the
            ``proxies`` entry of ``main_settings``.
        :param tries: total request attempts before giving up.
        :param timeout: default per-request timeout in seconds.
        :param logger: logger used for all connection diagnostics.
        :param header_values: extra headers merged into every request.
        :param accepted_response_codes: status codes treated as success
            (defaults to ``{200}``).
        :param semantic_not_found: when True a 404 is treated as a final
            "not found" answer instead of a retryable failure.
        :param sleep_after_404: seconds to wait after a non-accepted
            response before retrying (0 disables the wait).
        :param hearthbeat_interval: seconds between keep-alive calls;
            0 or less disables the hearthbeat.
        """
        if proxies is None:
            proxies = main_settings["proxies"]
        if header_values is None:
            header_values = dict()

        self.HEADER_VALUES = header_values

        self.LOGGER = logger
        self.HOST = urlparse(host)
        self.TRIES = tries
        self.TIMEOUT = timeout
        self.rotating_proxy = RotatingProxy(proxy_list=proxies)

        self.ACCEPTED_RESPONSE_CODES = accepted_response_codes or {200}
        self.SEMANTIC_NOT_FOUND = semantic_not_found
        self.sleep_after_404 = sleep_after_404

        self.session = requests.Session()
        self.session.headers = self.get_header(**self.HEADER_VALUES)
        self.session.proxies = self.rotating_proxy.current_proxy

        # While a hearthbeat request is in flight, regular requests wait
        # (see _request) so the session isn't used concurrently.
        self.session_is_occupied: bool = False
        self.hearthbeat_thread = None
        self.hearthbeat_interval = hearthbeat_interval

    @property
    def user_agent(self) -> str:
        """Return the user-agent currently set on the session (with a sane default)."""
        return self.session.headers.get("user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36")

    def start_hearthbeat(self):
        """Spawn the daemon thread that periodically calls :meth:`hearthbeat`."""
        if self.hearthbeat_interval <= 0:
            self.LOGGER.warning(f"Can't start a hearthbeat with {self.hearthbeat_interval}s in between.")
            # BUG FIX: previously the thread was started anyway, despite the warning.
            return

        self.hearthbeat_thread = threading.Thread(target=self._hearthbeat_loop, args=(self.hearthbeat_interval, ), daemon=True)
        self.hearthbeat_thread.start()

    def hearthbeat_failed(self):
        """Hook invoked when a keep-alive attempt fails; subclasses may override."""
        self.LOGGER.warning(f"I just died... (The hearthbeat failed)")

    def hearthbeat(self):
        # Subclasses that enable the hearthbeat must override this with the
        # actual keep-alive request.
        print("the hearth is beating, but it needs to be implemented ;-;\nFuck youuuu for setting hearthbeat in the constructor to true, but not implementing the method Connection.hearbeat()")

    def _hearthbeat_loop(self, interval: float):
        """Body of the hearthbeat thread: beat, sleep ``interval`` seconds, repeat forever."""
        def hearthbeat_wrapper():
            # Mark the session busy so _request pauses normal traffic
            # while the keep-alive call is running.
            self.session_is_occupied = True
            self.LOGGER.debug(f"I am living. (sending a hearthbeat)")
            self.hearthbeat()
            self.LOGGER.debug(f"finished the hearthbeat")
            self.session_is_occupied = False

        while True:
            hearthbeat_wrapper()
            time.sleep(interval)

    def base_url(self, url: ParseResult = None):
        """Return ``scheme://netloc`` of *url* (or of the configured host)."""
        if url is None:
            url = self.HOST

        return urlunsplit((url.scheme, url.netloc, "", "", ""))

    def get_header(self, **header_values) -> Dict[str, str]:
        """Build the default header set, with *header_values* taking precedence."""
        return {
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
            "Connection": "keep-alive",
            # "Host": self.HOST.netloc,
            "Referer": self.base_url(),
            **header_values
        }

    def rotate(self):
        """Advance to the next proxy in the pool and apply it to the session."""
        self.session.proxies = self.rotating_proxy.rotate()

    def _update_headers(
            self,
            headers: Optional[dict],
            refer_from_origin: bool,
            url: ParseResult
    ) -> Dict[str, str]:
        """
        Normalize per-request headers: when *refer_from_origin* is False the
        ``Referer`` is set to the base of the request *url* instead of the host.
        """
        if headers is None:
            headers = dict()

        if not refer_from_origin:
            headers["Referer"] = self.base_url(url=url)

        return headers

    def _request(
            self,
            request: Callable,
            try_count: int,
            accepted_response_codes: set,
            url: str,
            timeout: float,
            headers: dict,
            refer_from_origin: bool = True,
            raw_url: bool = False,
            sleep_after_404: float = None,
            is_hearthbeat: bool = False,
            **kwargs
    ) -> Optional[requests.Response]:
        """
        Perform one attempt of *request* and recurse (after rotating the proxy)
        until an accepted status code is returned or ``self.TRIES`` is exhausted.

        :param request: the bound session method to call (e.g. ``session.get``).
        :param try_count: attempts already made; recursion increments this.
        :param raw_url: when True, *url* is passed through verbatim instead of
            being re-serialized by ``urlparse``.
        :param is_hearthbeat: hearthbeat requests skip the "session occupied" wait.
        :return: the successful response, or ``None`` on exhaustion / semantic 404.
        """
        if sleep_after_404 is None:
            sleep_after_404 = self.sleep_after_404
        if try_count >= self.TRIES:
            return

        if timeout is None:
            timeout = self.TIMEOUT

        parsed_url = urlparse(url)

        headers = self._update_headers(
            headers=headers,
            refer_from_origin=refer_from_origin,
            url=parsed_url
        )

        request_url = parsed_url.geturl() if not raw_url else url

        connection_failed = False
        try:
            if self.session_is_occupied and not is_hearthbeat:
                self.LOGGER.info(f"Waiting for the hearthbeat to finish.")
                while self.session_is_occupied and not is_hearthbeat:
                    # BUG FIX: yield the CPU instead of busy-spinning at 100%.
                    time.sleep(0.05)

            r: requests.Response = request(request_url, timeout=timeout, headers=headers, **kwargs)

            if r.status_code in accepted_response_codes:
                return r

            if self.SEMANTIC_NOT_FOUND and r.status_code == 404:
                self.LOGGER.warning(f"Couldn't find url (404): {request_url}")
                return None

        except requests.exceptions.Timeout:
            self.LOGGER.warning(f"Request timed out at \"{request_url}\": ({try_count}-{self.TRIES})")
            connection_failed = True
        except requests.exceptions.ConnectionError:
            self.LOGGER.warning(f"Couldn't connect to \"{request_url}\": ({try_count}-{self.TRIES})")
            connection_failed = True

        if not connection_failed:
            # BUG FIX: log typo "responded wit" -> "responded with".
            self.LOGGER.warning(f"{self.HOST.netloc} responded with {r.status_code} "
                                f"at {url}. ({try_count}-{self.TRIES})")
            self.LOGGER.debug(r.content)

            if sleep_after_404 != 0:
                self.LOGGER.warning(f"Waiting for {sleep_after_404} seconds.")
                time.sleep(sleep_after_404)

        self.rotate()

        return self._request(
            request=request,
            try_count=try_count + 1,
            accepted_response_codes=accepted_response_codes,
            url=url,
            timeout=timeout,
            headers=headers,
            # BUG FIX: these two options were silently dropped on retries.
            refer_from_origin=refer_from_origin,
            raw_url=raw_url,
            sleep_after_404=sleep_after_404,
            is_hearthbeat=is_hearthbeat,
            **kwargs
        )

    def get(
            self,
            url: str,
            refer_from_origin: bool = True,
            stream: bool = False,
            accepted_response_codes: set = None,
            timeout: float = None,
            headers: dict = None,
            raw_url: bool = False,
            **kwargs
    ) -> Optional[requests.Response]:
        """GET *url* with retrying; returns ``None`` when all attempts fail."""
        if accepted_response_codes is None:
            accepted_response_codes = self.ACCEPTED_RESPONSE_CODES

        r = self._request(
            request=self.session.get,
            try_count=0,
            accepted_response_codes=accepted_response_codes,
            url=url,
            timeout=timeout,
            headers=headers,
            raw_url=raw_url,
            refer_from_origin=refer_from_origin,
            stream=stream,
            **kwargs
        )
        if r is None:
            self.LOGGER.warning(f"Max attempts ({self.TRIES}) exceeded for: GET:{url}")
        return r

    def post(
            self,
            url: str,
            json: dict = None,
            refer_from_origin: bool = True,
            stream: bool = False,
            accepted_response_codes: set = None,
            timeout: float = None,
            headers: dict = None,
            raw_url: bool = False,
            **kwargs
    ) -> Optional[requests.Response]:
        """POST *json* to *url* with retrying; returns ``None`` when all attempts fail."""
        r = self._request(
            request=self.session.post,
            try_count=0,
            accepted_response_codes=accepted_response_codes or self.ACCEPTED_RESPONSE_CODES,
            url=url,
            timeout=timeout,
            headers=headers,
            refer_from_origin=refer_from_origin,
            raw_url=raw_url,
            json=json,
            stream=stream,
            **kwargs
        )
        if r is None:
            # BUG FIX: this message wrongly said "GET" for a POST request.
            self.LOGGER.warning(f"Max attempts ({self.TRIES}) exceeded for: POST:{url}")
            self.LOGGER.warning(f"payload: {json}")
        return r

    def stream_into(
            self,
            url: str,
            target: Target,
            description: str = "download",
            refer_from_origin: bool = True,
            accepted_response_codes: set = None,
            timeout: float = None,
            headers: dict = None,
            raw_url: bool = False,
            chunk_size: int = main_settings["chunk_size"],
            try_count: int = 0,
            progress: int = 0,
            **kwargs
    ) -> DownloadResult:
        """
        Stream *url* into *target*, resuming with an HTTP ``Range`` header on
        connection errors, retrying up to ``self.TRIES`` times.

        :param try_count: internal retry counter for the stream itself.
        :param progress: bytes already written; >0 triggers a Range request.
        :return: a :class:`DownloadResult`, carrying an error message on failure.
        """
        if progress > 0:
            if headers is None:
                headers = dict()
            headers["Range"] = f"bytes={target.size}-"

        if accepted_response_codes is None:
            accepted_response_codes = self.ACCEPTED_RESPONSE_CODES

        r = self._request(
            request=self.session.get,
            try_count=0,
            accepted_response_codes=accepted_response_codes,
            url=url,
            timeout=timeout,
            headers=headers,
            raw_url=raw_url,
            refer_from_origin=refer_from_origin,
            stream=True,
            **kwargs
        )

        if r is None:
            return DownloadResult(error_message=f"Could not establish connection to: {url}")

        target.create_path()
        # BUG FIX: servers may omit Content-Length; previously int(None) raised
        # TypeError. Fall back to 0 (disables the size-based retry check).
        total_size = int(r.headers.get('content-length', 0))
        progress = 0

        retry = False

        with target.open("ab") as f:
            """
            https://en.wikipedia.org/wiki/Kilobyte
            > The internationally recommended unit symbol for the kilobyte is kB.
            """
            with tqdm(total=total_size - target.size, unit='B', unit_scale=True, unit_divisor=1024, desc=description) as t:
                try:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        size = f.write(chunk)
                        progress += size
                        t.update(size)

                except requests.exceptions.ConnectionError:
                    if try_count >= self.TRIES:
                        self.LOGGER.warning(f"Stream timed out at \"{url}\": to many retries, aborting.")
                        return DownloadResult(error_message=f"Stream timed out from {url}, reducing the chunksize might help.")

                    self.LOGGER.warning(f"Stream timed out at \"{url}\": ({try_count}-{self.TRIES})")
                    retry = True

            if total_size > progress:
                retry = True

            if retry:
                self.LOGGER.warning(f"Retrying stream...")
                # BUG FIX: .add(206) mutated the shared self.ACCEPTED_RESPONSE_CODES
                # set when the default was in use; build a fresh set instead.
                accepted_response_codes = accepted_response_codes | {206}
                return self.stream_into(
                    url=url,
                    target=target,
                    description=description,
                    try_count=try_count + 1,
                    progress=progress,
                    accepted_response_codes=accepted_response_codes,
                    timeout=timeout,
                    headers=headers,
                    raw_url=raw_url,
                    refer_from_origin=refer_from_origin,
                    chunk_size=chunk_size,
                    **kwargs
                )

        return DownloadResult()