import os
import csv
import time
import threading
import logging
import base64
import struct
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from functools import wraps
from urllib.parse import urlparse, quote

import requests
from msal import ConfidentialClientApplication

# --- Production Configuration ---
MAX_WORKERS = 5
MAX_RETRIES = 5
CHUNK_SIZE = 1024 * 1024  # 1MB Chunks
LOG_FILE = "sharepoint_download.log"

# Setup Logging: mirror every record to a UTF-8 log file and the console;
# %(threadName)s makes per-worker activity traceable in the pool.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(threadName)s: %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Serializes report writes across worker threads.
report_lock = threading.Lock()


def format_size(size_bytes):
    """Return *size_bytes* as a human-readable string, e.g. ``1.50 MB``.

    Divides by 1024 through B..PB; anything larger is reported in EB.
    """
    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.2f} EB"


def load_config(file_path):
    """Parse a simple ``KEY=VALUE`` config file into a dict.

    Values are stripped of surrounding whitespace and double quotes.
    Blank lines and ``#`` comment lines are ignored.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
    """
    config = {}
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Configuration file {file_path} not found.")
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # Skip blanks and comments so a commented-out "KEY=VALUE" is not parsed.
            if not stripped or stripped.startswith('#'):
                continue
            if '=' in stripped:
                key, value = stripped.split('=', 1)
                config[key.strip()] = value.strip().strip('"')
    return config


# --- Punkt 1: Exponential Backoff & Retry Logic ---
def retry_request(func):
    """Decorator adding retry with exponential backoff to an HTTP call.

    Honors HTTP 429 ``Retry-After`` (falling back to exponential backoff if
    the header is absent or an HTTP-date rather than a seconds value).
    Other request failures are retried with 2**n backoff up to MAX_RETRIES,
    then the last exception is re-raised.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        retries = 0
        while retries < MAX_RETRIES:
            try:
                response = func(*args, **kwargs)
                if response.status_code == 429:
                    # Retry-After may be an HTTP-date; int() would raise, so
                    # fall back to exponential backoff in that case.
                    try:
                        retry_after = int(response.headers.get("Retry-After", 2 ** retries))
                    except ValueError:
                        retry_after = 2 ** retries
                    logger.warning(f"Throttled (429). Waiting {retry_after}s...")
                    time.sleep(retry_after)
                    retries += 1
                    continue
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                retries += 1
                wait = 2 ** retries
                if retries >= MAX_RETRIES:
                    raise e
                logger.error(f"Request failed: {e}. Retrying in {wait}s...")
                time.sleep(wait)
        raise requests.exceptions.RetryError(f"Max retries ({MAX_RETRIES}) exceeded.")
    return wrapper


@retry_request
def safe_get(url, headers, stream=False, timeout=60, params=None):
    """``requests.get`` wrapped with the retry/backoff policy above."""
    return requests.get(url, headers=headers, stream=stream,
                        timeout=timeout, params=params)


# --- Punkt 4: Integrity Validation (QuickXorHash) ---
def quickxorhash(file_path):
    """Compute Microsoft QuickXorHash for a file. Returns base64-encoded string.

    Matches Microsoft's C# reference implementation: a 160-bit accumulator
    stored as three uint64 cells of 64 + 64 + 32 usable bits. Each input byte
    is XORed in at a bit position that advances by 11 per byte and wraps at
    160; bytes that straddle a cell boundary spill their high bits into the
    next cell (wrapping from bit 159 back to bit 0). The digest is the 20
    little-endian bytes of the cells with the total byte length XORed into
    the trailing 8 bytes.
    """
    SHIFT = 11
    WIDTH = 160
    MASK64 = 0xFFFFFFFFFFFFFFFF

    cells = [0, 0, 0]   # bits 0-63, 64-127, 128-159 (last cell uses 32 bits)
    bit_pos = 0         # current insertion bit position (mod WIDTH)
    total_bytes = 0     # total input length, folded into the digest at the end

    with open(file_path, 'rb') as f:
        while chunk := f.read(CHUNK_SIZE):
            for b in chunk:
                cell = bit_pos // 64
                offset = bit_pos % 64
                cell_bits = 32 if cell == 2 else 64
                cells[cell] = (cells[cell] ^ (b << offset)) & MASK64
                if offset > cell_bits - 8:
                    # Byte straddles the cell boundary: carry the high bits
                    # into the next cell, wrapping from the last cell to 0.
                    nxt = 0 if cell == 2 else cell + 1
                    cells[nxt] ^= b >> (cell_bits - offset)
                bit_pos = (bit_pos + SHIFT) % WIDTH
            total_bytes += len(chunk)

    # 20-byte little-endian digest (the last cell contributes only 4 bytes),
    # then XOR the file length (little-endian int64) into bytes 12..19.
    digest = bytearray(struct.pack('<QQQ', cells[0], cells[1], cells[2])[:20])
    for i, length_byte in enumerate(struct.pack('<q', total_bytes)):
        digest[12 + i] ^= length_byte
    return base64.b64encode(bytes(digest)).decode('ascii')