Files
Sharepoint-Download-Tool/download_sharepoint.py
Martin Tranberg 634b5ff151 Tilføj 429-håndtering, eksponentiel backoff og dybdebegrænsning
- get_fresh_download_url: tilføjer 429-tjek med Retry-After og erstatter
  fast sleep(1) med eksponentiel backoff (2^attempt sekunder)
- process_item_list: tilføjer MAX_FOLDER_DEPTH=50 guard mod RecursionError
  ved unormalt dybe SharePoint-mappestrukturer
- README og CLAUDE.md opdateret med beskrivelse af nye adfærd

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 15:16:12 +01:00

345 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import base64
import csv
import functools
import logging
import os
import struct
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from urllib.parse import urlparse, quote

import requests
from msal import ConfidentialClientApplication
# --- Production Configuration ---
MAX_WORKERS = 5  # parallel download threads in the pool
MAX_RETRIES = 5  # attempts per request inside retry_request
CHUNK_SIZE = 1024 * 1024 # 1MB Chunks
MAX_FOLDER_DEPTH = 50  # recursion guard against pathologically deep folder trees
LOG_FILE = "sharepoint_download.log"
# Setup Logging
# Logs go both to a UTF-8 file and to the console; threadName identifies
# which download worker emitted each record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(threadName)s: %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
# Guards concurrent appends to the shared error-report list.
report_lock = threading.Lock()
def format_size(size_bytes):
    """Render a byte count as a human-readable string with two decimals."""
    value = size_bytes
    for suffix in ('B', 'KB', 'MB', 'GB', 'TB', 'PB'):
        if value < 1024.0:
            return f"{value:.2f} {suffix}"
        value = value / 1024.0
    # Anything that survives six divisions is reported in exabytes.
    return f"{value:.2f} EB"
def load_config(file_path):
    """Parse a KEY=VALUE configuration file into a dict.

    Blank lines and lines starting with '#' are ignored, so the config file
    can carry comments. Values are stripped of surrounding whitespace and
    double quotes.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    config = {}
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Configuration file {file_path} not found.")
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # Skip blanks and comments; previously a commented-out
            # "KEY=VALUE" line would have been parsed as a real entry.
            if not stripped or stripped.startswith('#'):
                continue
            if '=' in stripped:
                key, value = stripped.split('=', 1)
                config[key.strip()] = value.strip().strip('"')
    return config
# --- Punkt 1: Exponential Backoff & Retry Logic ---
def retry_request(func):
    """Decorator adding throttle-aware retries with exponential backoff.

    Honors HTTP 429 Retry-After headers (falling back to 2**retries seconds)
    and retries transient request failures up to MAX_RETRIES times, sleeping
    2**retries seconds between attempts. Re-raises the last exception once
    the retry budget is exhausted.
    """
    # functools.wraps preserves the wrapped function's __name__/__doc__,
    # which would otherwise all appear as "wrapper" in logs and tracebacks.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        retries = 0
        while retries < MAX_RETRIES:
            try:
                response = func(*args, **kwargs)
                # Graph throttling: obey the server-suggested pause if present.
                if response.status_code == 429:
                    retry_after = int(response.headers.get("Retry-After", 2 ** retries))
                    logger.warning(f"Throttled (429). Waiting {retry_after}s...")
                    time.sleep(retry_after)
                    retries += 1
                    continue
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                retries += 1
                wait = 2 ** retries
                if retries >= MAX_RETRIES:
                    raise e
                logger.error(f"Request failed: {e}. Retrying in {wait}s...")
                time.sleep(wait)
        # Only reached when every attempt was consumed by 429 responses.
        raise requests.exceptions.RetryError(f"Max retries ({MAX_RETRIES}) exceeded.")
    return wrapper
@retry_request
def safe_get(url, headers, stream=False, timeout=60, params=None):
    """GET *url* through the retry/backoff wrapper; returns the Response."""
    return requests.get(url, headers=headers, stream=stream, timeout=timeout, params=params)
# --- Punkt 4: Integrity Validation (QuickXorHash) ---
def quickxorhash(file_path, chunk_size=None):
    """Compute Microsoft QuickXorHash for a file. Returns a base64 string.

    Implements Microsoft's reference algorithm: each input byte is XORed
    into a 160-bit accumulator at bit position (byte_index * 11) % 160,
    with bits that pass the 160-bit boundary wrapping around to bit 0, and
    the total input length (little-endian uint64) XORed into the last
    8 bytes of the 20-byte digest.

    The previous version masked each cell to 64 bits (silently dropping
    carry bits), ignored the 32-bit width of the final cell, and omitted
    the length XOR — so it disagreed with SharePoint's quickXorHash for
    nearly all files and triggered spurious integrity failures.

    Args:
        file_path: path of the file to hash.
        chunk_size: read granularity; defaults to the module CHUNK_SIZE.
    """
    SHIFT = 11
    WIDTH = 160
    MASK = (1 << WIDTH) - 1
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    acc = 0    # 160-bit rotating accumulator
    pos = 0    # current bit position within the accumulator
    total = 0  # total bytes hashed (folded into the digest tail)
    with open(file_path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            for byte in chunk:
                shifted = byte << pos
                # XOR in place; any bits beyond bit 159 wrap to the low end.
                acc ^= (shifted & MASK) | (shifted >> WIDTH)
                pos = (pos + SHIFT) % WIDTH
            total += len(chunk)
    digest = bytearray(acc.to_bytes(20, 'little'))
    # The reference algorithm XORs the length into digest bytes 12..19.
    for i, length_byte in enumerate(total.to_bytes(8, 'little')):
        digest[12 + i] ^= length_byte
    return base64.b64encode(bytes(digest)).decode('ascii')
def verify_integrity(local_path, remote_hash):
    """Verifies file integrity using Microsoft QuickXorHash."""
    # No hash supplied by the remote item; caller falls back to size check.
    if not remote_hash:
        return True
    local_hash = quickxorhash(local_path)
    if local_hash == remote_hash:
        return True
    logger.warning(f"Hash mismatch for {local_path}: local={local_hash}, remote={remote_hash}")
    return False
def get_headers(app, force_refresh=False):
    """Return Graph Authorization headers via MSAL client-credential flow.

    Consults the MSAL token cache first unless force_refresh is set, then
    falls back to a forced token refresh before giving up.
    """
    scopes = ["https://graph.microsoft.com/.default"]
    result = None if force_refresh else app.acquire_token_for_client(scopes=scopes)
    if not result or "access_token" not in result:
        logger.info("Refreshing Access Token...")
        result = app.acquire_token_for_client(scopes=scopes, force_refresh=True)
    if "access_token" not in result:
        raise Exception(f"Auth failed: {result.get('error_description')}")
    return {'Authorization': f'Bearer {result["access_token"]}'}
def get_site_id(app, site_url):
    """Resolve a SharePoint site URL to its Microsoft Graph site id."""
    parsed = urlparse(site_url)
    endpoint = f"https://graph.microsoft.com/v1.0/sites/{parsed.netloc}:{parsed.path}"
    site = safe_get(endpoint, headers=get_headers(app)).json()
    return site['id']
def get_drive_id(app, site_id, drive_name):
    """Look up the drive id of a document library by its display name."""
    endpoint = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives"
    drives = safe_get(endpoint, headers=get_headers(app)).json().get('value', [])
    for drive in drives:
        if drive['name'] == drive_name:
            return drive['id']
    raise Exception(f"Drive {drive_name} not found")
# --- Punkt 2: Resume / Chunked Download logic ---
def get_fresh_download_url(app, drive_id, item_id):
    """Fetches a fresh download URL for a specific item ID with retries and robust error handling.

    Returns a (download_url, error) tuple: (url, None) on success, or
    (None, message) after all attempts fail.
    """
    url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}"
    # Up to 3 attempts, backing off 2**attempt seconds between them.
    for attempt in range(3):
        try:
            headers = get_headers(app)
            response = requests.get(url, headers=headers, timeout=60)
            # Throttled: honor Retry-After (fallback: exponential backoff).
            # NOTE: a 429 consumes one of the 3 attempts.
            if response.status_code == 429:
                retry_after = int(response.headers.get("Retry-After", 2 ** attempt))
                logger.warning(f"Throttled (429) in get_fresh_download_url. Waiting {retry_after}s...")
                time.sleep(retry_after)
                continue
            # Token may have expired mid-run; force a refresh and retry once.
            if response.status_code == 401:
                logger.info(f"Access Token expired during refresh (Attempt {attempt+1}). Forcing refresh...")
                headers = get_headers(app, force_refresh=True)
                response = requests.get(url, headers=headers, timeout=60)
            response.raise_for_status()
            data = response.json()
            download_url = data.get('@microsoft.graph.downloadUrl')
            if download_url:
                return download_url, None
            # If item exists but URL is missing, it might be a transient SharePoint issue
            logger.warning(f"Attempt {attempt+1}: '@microsoft.graph.downloadUrl' missing for {item_id}. Retrying in {2 ** attempt}s...")
            time.sleep(2 ** attempt)
        except Exception as e:
            # Last attempt: surface the error instead of retrying.
            if attempt == 2:
                return None, str(e)
            logger.warning(f"Attempt {attempt+1} failed: {e}. Retrying in {2 ** attempt}s...")
            time.sleep(2 ** attempt)
    return None, "Item returned but '@microsoft.graph.downloadUrl' was missing after 3 attempts."
def download_single_file(app, drive_id, item_id, local_path, expected_size, display_name, remote_hash=None, initial_url=None):
    """Download one file, with skip/resume support and integrity checks.

    Runs inside a worker thread. Returns (True, None) on success or
    (False, error_message) on any failure; exceptions are never raised.

    remote_hash: SharePoint's quickXorHash, if provided, used to verify
        complete files before skipping and after downloading.
    initial_url: pre-fetched '@microsoft.graph.downloadUrl' from the
        listing; a fresh one is fetched if missing or expired.
    """
    try:
        file_mode = 'wb'
        resume_header = {}
        existing_size = 0
        download_url = initial_url
        if os.path.exists(local_path):
            existing_size = os.path.getsize(local_path)
            if existing_size == expected_size:
                # Size matches: skip only if the hash also checks out.
                if verify_integrity(local_path, remote_hash):
                    logger.info(f"Skipped (complete): {display_name}")
                    return True, None
                else:
                    logger.warning(f"Hash mismatch on existing file: {display_name}. Re-downloading.")
                    existing_size = 0
            elif existing_size < expected_size:
                # Partial file: resume from the current byte offset.
                logger.info(f"Resuming: {display_name} from {format_size(existing_size)}")
                resume_header = {'Range': f'bytes={existing_size}-'}
                file_mode = 'ab'
            else:
                # Local file larger than remote: start over ('wb' truncates).
                logger.warning(f"Local file larger than remote: {display_name}. Overwriting.")
                existing_size = 0
        logger.info(f"Starting: {display_name} ({format_size(expected_size)})")
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        # Initial download attempt
        if not download_url:
            download_url, err = get_fresh_download_url(app, drive_id, item_id)
            if not download_url:
                return False, f"Could not fetch initial URL: {err}"
        try:
            response = safe_get(download_url, resume_header, stream=True, timeout=120)
        except requests.exceptions.HTTPError as e:
            if e.response is not None and e.response.status_code == 401:
                # Handle 401 Unauthorized from SharePoint (expired download link)
                logger.warning(f"URL expired for {display_name}. Fetching fresh URL...")
                download_url, err = get_fresh_download_url(app, drive_id, item_id)
                if not download_url:
                    return False, f"Failed to refresh download URL: {err}"
                response = safe_get(download_url, resume_header, stream=True, timeout=120)
            else:
                raise
        # Stream to disk in CHUNK_SIZE pieces; 'ab' appends when resuming.
        with open(local_path, file_mode) as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    f.write(chunk)
        # Post-download check
        final_size = os.path.getsize(local_path)
        if final_size == expected_size:
            if verify_integrity(local_path, remote_hash):
                logger.info(f"DONE: {display_name}")
                return True, None
            else:
                return False, "Integrity check failed (Hash mismatch)"
        else:
            return False, f"Size mismatch: Remote={expected_size}, Local={final_size}"
    except Exception as e:
        # Broad catch: workers must report failures, never crash the pool.
        return False, str(e)
# --- Main Traversal Logic ---
def process_item_list(app, drive_id, item_path, local_root_path, report, executor, futures, depth=0):
    """Recursively walk a drive folder, submitting file downloads to the pool.

    item_path: drive-relative folder path ('' means the drive root).
    futures: dict mapping each submitted Future to its display path,
        consumed by the caller via as_completed.
    depth: recursion depth, capped at MAX_FOLDER_DEPTH to avoid
        RecursionError on abnormally deep folder trees.
    Traversal errors are appended to *report* rather than raised.
    """
    if depth >= MAX_FOLDER_DEPTH:
        logger.warning(f"Max folder depth ({MAX_FOLDER_DEPTH}) reached at: {item_path}. Skipping subtree.")
        return
    try:
        auth_headers = get_headers(app)
        encoded_path = quote(item_path)
        if not item_path:
            url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root/children"
        else:
            url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{encoded_path}:/children"
        # Follow @odata.nextLink paging until the folder listing is exhausted.
        while url:
            response = safe_get(url, headers=auth_headers)
            data = response.json()
            items = data.get('value', [])
            for item in items:
                item_name = item['name']
                local_path = os.path.join(local_root_path, item_name)
                display_path = f"{item_path}/{item_name}".strip('/')
                if 'folder' in item:
                    process_item_list(app, drive_id, display_path, local_path, report, executor, futures, depth + 1)
                elif 'file' in item:
                    item_id = item['id']
                    # Pre-fetched URL and hash from the listing save one
                    # metadata round-trip per file in the worker.
                    download_url = item.get('@microsoft.graph.downloadUrl')
                    remote_hash = item.get('file', {}).get('hashes', {}).get('quickXorHash')
                    future = executor.submit(
                        download_single_file,
                        app, drive_id, item_id,
                        local_path, item['size'], display_path,
                        remote_hash, download_url
                    )
                    futures[future] = display_path
            url = data.get('@odata.nextLink')
            if url:
                # Refresh headers between pages in case the token expired.
                auth_headers = get_headers(app)
    except Exception as e:
        logger.error(f"Error traversing {item_path}: {e}")
        with report_lock:
            report.append({"Path": item_path, "Error": str(e), "Timestamp": datetime.now().isoformat()})
def create_msal_app(tenant_id, client_id, client_secret):
    """Build the MSAL confidential-client application for this tenant."""
    authority = f"https://login.microsoftonline.com/{tenant_id}"
    return ConfidentialClientApplication(
        client_id,
        authority=authority,
        client_credential=client_secret,
    )
def main():
    """Entry point: load config, scan configured folders, download in parallel.

    Reads connection_info.txt, authenticates via MSAL, traverses each
    configured folder submitting downloads to a thread pool, and writes a
    timestamped CSV report of all errors.
    """
    try:
        config = load_config('connection_info.txt')
        tenant_id = config.get('TENANT_ID', '')
        client_id = config.get('CLIENT_ID', '')
        client_secret = config.get('CLIENT_SECRET', '')
        site_url = config.get('SITE_URL', '')
        drive_name = config.get('DOCUMENT_LIBRARY', '')
        folders_str = config.get('FOLDERS_TO_DOWNLOAD', '')
        local_base = config.get('LOCAL_PATH', '').replace('\\', os.sep)
        # Empty folder list means "download the drive root".
        folders = [f.strip() for f in folders_str.split(',') if f.strip()] or [""]
        logger.info("Initializing SharePoint Production Sync Tool...")
        app = create_msal_app(tenant_id, client_id, client_secret)
        site_id = get_site_id(app, site_url)
        drive_id = get_drive_id(app, site_id, drive_name)
        report = []
        with ThreadPoolExecutor(max_workers=MAX_WORKERS, thread_name_prefix="DL") as executor:
            futures = {}
            for folder in folders:
                logger.info(f"Scanning: {folder or 'Root'}")
                process_item_list(app, drive_id, folder, os.path.join(local_base, folder), report, executor, futures)
            logger.info(f"Scan complete. Processing {len(futures)} tasks...")
            # Collect worker results; failures go into the CSV report.
            for future in as_completed(futures):
                path = futures[future]
                success, error = future.result()
                if not success:
                    logger.error(f"FAILED: {path} | {error}")
                    with report_lock:
                        report.append({"Path": path, "Error": error, "Timestamp": datetime.now().isoformat()})
        report_file = f"download_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        with open(report_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=["Path", "Error", "Timestamp"])
            writer.writeheader()
            writer.writerows(report)
        logger.info(f"Sync complete. Errors: {len(report)}. Report: {report_file}")
    except Exception as e:
        logger.critical(f"FATAL ERROR: {e}")
if __name__ == "__main__":
    main()