diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index 7a74dbc..0000000 --- a/CLAUDE.md +++ /dev/null @@ -1,50 +0,0 @@ -# CLAUDE.md - -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. - -## Project Overview - -A Python utility that synchronizes SharePoint Online folders to local storage using the Microsoft Graph API. Offers both a CLI (`download_sharepoint.py`) and a modern GUI (`sharepoint_gui.py`). - -## Running the Tool - -```bash -# Install dependencies -pip install -r requirements.txt - -# GUI mode (recommended for interactive use) -python sharepoint_gui.py - -# CLI mode (for automation/scripting) -python download_sharepoint.py -``` - -Configuration is read from `connection_info.txt` (gitignored — copy from `connection_info.template.txt` and fill in credentials). - -## Architecture - -Two-file structure with clear separation of concerns: - -**`download_sharepoint.py`** — Core engine with four logical layers: -1. **Authentication** — MSAL `ConfidentialClientApplication` using OAuth 2.0 Client Credentials flow. Tokens are refreshed via `force_refresh=True` when a 401 is received. -2. **Graph API navigation** — `get_site_id()` → `get_drive_id()` → `process_item_list()` (recursive, handles `@odata.nextLink` pagination). -3. **Download & resilience** — `download_single_file()` with Range header support for resumable downloads. `get_fresh_download_url()` handles expired pre-signed URLs and includes its own 429 detection and exponential backoff (`2^attempt` seconds). The `@retry_request` decorator provides the same for all other API calls (up to 5 retries). -4. **Concurrency** — `ThreadPoolExecutor` (max 5 workers). A `report_lock` guards the shared error list. A `stop_event` allows the GUI stop button to cancel in-flight work. -5. **Folder depth guard** — `process_item_list()` accepts a `depth` parameter and stops recursion at `MAX_FOLDER_DEPTH = 50`, logging a warning for any skipped subtrees. - -**`sharepoint_gui.py`** — CustomTkinter wrapper that: -- Persists settings to a local JSON file -- Spawns the core engine in a background thread -- Patches `requests.get` to route through the GUI's log display -- Provides a folder browser for `LOCAL_PATH` - -## Key Behaviors to Preserve - -- **Self-healing sessions**: On 401, the code refreshes both the MSAL access token *and* the pre-signed Graph download URL before retrying — these are two separate expiry mechanisms. -- **Resumable downloads**: Files are downloaded in 1 MB chunks using HTTP Range headers. Existing files are skipped if their size matches; partial files are resumed from the last byte. -- **Stop signal**: `stop_event.is_set()` is checked in the download loop and recursive traversal — any new code that loops must respect this. - -## Output - -- `sharepoint_download.log` — Full operation log -- `download_report_YYYYMMDD_HHMMSS.csv` — Per-run error report (gitignored) diff --git a/download_sharepoint.py b/download_sharepoint.py index b3470f7..6bc81ef 100644 --- a/download_sharepoint.py +++ b/download_sharepoint.py @@ -22,10 +22,6 @@ CHUNK_SIZE = 1024 * 1024 # 1MB Chunks MAX_FOLDER_DEPTH = 50 LOG_FILE = "sharepoint_download.log" -# Hash Validation Settings -ENABLE_HASH_VALIDATION = True # Set to False to only check file size -HASH_THRESHOLD_MB = 30 # Only hash files smaller than this (in MB) - # Setup Logging logging.basicConfig( level=logging.INFO, @@ -46,9 +42,12 @@ def format_size(size_bytes): return f"{size_bytes:.2f} EB" def get_long_path(path): - """Handles Windows Long Path limitation by prefixing with \\?\\ for absolute paths.""" + r"""Handles Windows Long Path limitation by prefixing with \\?\ for absolute paths. + Correctly handles UNC paths (e.g. \\server\share -> \\?\UNC\server\share).""" path = os.path.abspath(path) if os.name == 'nt' and not path.startswith("\\\\?\\"): + if path.startswith("\\\\"): + return "\\\\?\\UNC\\" + path[2:] return "\\\\?\\" + path return path @@ -61,6 +60,21 @@ def load_config(file_path): if '=' in line: key, value = line.split('=', 1) config[key.strip()] = value.strip().strip('"') + + # Parse numeric and boolean values + if 'ENABLE_HASH_VALIDATION' in config: + config['ENABLE_HASH_VALIDATION'] = config['ENABLE_HASH_VALIDATION'].lower() == 'true' + else: + config['ENABLE_HASH_VALIDATION'] = True + + if 'HASH_THRESHOLD_MB' in config: + try: + config['HASH_THRESHOLD_MB'] = int(config['HASH_THRESHOLD_MB']) + except ValueError: + config['HASH_THRESHOLD_MB'] = 30 + else: + config['HASH_THRESHOLD_MB'] = 30 + return config # --- Punkt 1: Exponential Backoff & Retry Logic --- @@ -140,16 +154,17 @@ def quickxorhash(file_path): result = h.to_bytes(20, byteorder='little') return base64.b64encode(result).decode('ascii') -def verify_integrity(local_path, remote_hash): - """Verifies file integrity based on global settings.""" - if not remote_hash or not ENABLE_HASH_VALIDATION: +def verify_integrity(local_path, remote_hash, config): + """Verifies file integrity based on config settings.""" + if not remote_hash or not config.get('ENABLE_HASH_VALIDATION', True): return True file_size = os.path.getsize(get_long_path(local_path)) - threshold_bytes = HASH_THRESHOLD_MB * 1024 * 1024 + threshold_mb = config.get('HASH_THRESHOLD_MB', 30) + threshold_bytes = threshold_mb * 1024 * 1024 if file_size > threshold_bytes: - logger.info(f"Skipping hash check (size > {HASH_THRESHOLD_MB}MB): {os.path.basename(local_path)}") + logger.info(f"Skipping hash check (size > {threshold_mb}MB): {os.path.basename(local_path)}") return True local_hash = quickxorhash(local_path) @@ -241,8 +256,11 @@ def get_fresh_download_url(app, drive_id, item_id): return None, "Item returned but '@microsoft.graph.downloadUrl' was missing after 3 attempts." -def download_single_file(app, drive_id, item_id, local_path, expected_size, display_name, remote_hash=None, initial_url=None, remote_mtime_str=None): +def download_single_file(app, drive_id, item_id, local_path, expected_size, display_name, config, stop_event=None, remote_hash=None, initial_url=None, remote_mtime_str=None): try: + if stop_event and stop_event.is_set(): + raise InterruptedError("Sync cancelled") + file_mode = 'wb' resume_header = {} existing_size = 0 @@ -260,12 +278,8 @@ def download_single_file(app, drive_id, item_id, local_path, expected_size, disp # Hvis filen findes, har rigtig størrelse OG lokal er ikke ældre end remote -> SKIP if existing_size == expected_size: if local_mtime >= (remote_mtime - 1): # Vi tillader 1 sekuds difference pga. filsystem-præcision - if verify_integrity(local_path, remote_hash): - logger.info(f"Skipped (up-to-date): {display_name}") - return True, None - else: - logger.warning(f"Hash mismatch on existing file: {display_name}. Re-downloading.") - existing_size = 0 + logger.info(f"Skipped (up-to-date): {display_name}") + return True, None else: logger.info(f"Update available: {display_name} (Remote is newer)") existing_size = 0 @@ -306,13 +320,15 @@ def download_single_file(app, drive_id, item_id, local_path, expected_size, disp with open(long_local_path, file_mode) as f: for chunk in response.iter_content(chunk_size=CHUNK_SIZE): + if stop_event and stop_event.is_set(): + raise InterruptedError("Sync cancelled") if chunk: f.write(chunk) # Post-download check final_size = os.path.getsize(long_local_path) if final_size == expected_size: - if verify_integrity(local_path, remote_hash): + if verify_integrity(local_path, remote_hash, config): logger.info(f"DONE: {display_name}") return True, None else: @@ -320,15 +336,20 @@ def download_single_file(app, drive_id, item_id, local_path, expected_size, disp else: return False, f"Size mismatch: Remote={expected_size}, Local={final_size}" + except InterruptedError: + raise except Exception as e: return False, str(e) # --- Main Traversal Logic --- -def process_item_list(app, drive_id, item_path, local_root_path, report, executor, futures, depth=0): +def process_item_list(app, drive_id, item_path, local_root_path, report, executor, futures, config, stop_event=None, depth=0): if depth >= MAX_FOLDER_DEPTH: logger.warning(f"Max folder depth ({MAX_FOLDER_DEPTH}) reached at: {item_path}. Skipping subtree.") return try: + if stop_event and stop_event.is_set(): + raise InterruptedError("Sync cancelled") + encoded_path = quote(item_path) if not item_path: @@ -342,12 +363,15 @@ def process_item_list(app, drive_id, item_path, local_root_path, report, executo items = data.get('value', []) for item in items: + if stop_event and stop_event.is_set(): + raise InterruptedError("Sync cancelled") + item_name = item['name'] local_path = os.path.join(local_root_path, item_name) display_path = f"{item_path}/{item_name}".strip('/') if 'folder' in item: - process_item_list(app, drive_id, display_path, local_path, report, executor, futures, depth + 1) + process_item_list(app, drive_id, display_path, local_path, report, executor, futures, config, stop_event, depth + 1) elif 'file' in item: item_id = item['id'] download_url = item.get('@microsoft.graph.downloadUrl') @@ -358,12 +382,14 @@ def process_item_list(app, drive_id, item_path, local_root_path, report, executo download_single_file, app, drive_id, item_id, local_path, item['size'], display_path, - remote_hash, download_url, remote_mtime + config, stop_event, remote_hash, download_url, remote_mtime ) futures[future] = display_path url = data.get('@odata.nextLink') + except InterruptedError: + raise except Exception as e: logger.error(f"Error traversing {item_path}: {e}") with report_lock: @@ -374,9 +400,11 @@ def create_msal_app(tenant_id, client_id, client_secret): client_id, authority=f"https://login.microsoftonline.com/{tenant_id}", client_credential=client_secret ) -def main(): +def main(config=None, stop_event=None): try: - config = load_config('connection_info.txt') + if config is None: + config = load_config('connection_info.txt') + tenant_id = config.get('TENANT_ID', '') client_id = config.get('CLIENT_ID', '') client_secret = config.get('CLIENT_SECRET', '') @@ -385,17 +413,6 @@ def main(): folders_str = config.get('FOLDERS_TO_DOWNLOAD', '') local_base = config.get('LOCAL_PATH', '').replace('\\', os.sep) - # Opdater globale indstillinger fra config hvis de findes - global ENABLE_HASH_VALIDATION, HASH_THRESHOLD_MB - if 'ENABLE_HASH_VALIDATION' in config: - # Vi tjekker om strengen er 'true' (case-insensitive) - ENABLE_HASH_VALIDATION = config['ENABLE_HASH_VALIDATION'].lower() == 'true' - if 'HASH_THRESHOLD_MB' in config: - try: - HASH_THRESHOLD_MB = int(config['HASH_THRESHOLD_MB']) - except ValueError: - logger.warning(f"Ugyldig værdi for HASH_THRESHOLD_MB i config: {config['HASH_THRESHOLD_MB']}. Bruger standard: {HASH_THRESHOLD_MB}") - folders = [f.strip() for f in folders_str.split(',') if f.strip()] or [""] logger.info("Initializing SharePoint Production Sync Tool...") @@ -407,18 +424,29 @@ def main(): with ThreadPoolExecutor(max_workers=MAX_WORKERS, thread_name_prefix="DL") as executor: futures = {} for folder in folders: + if stop_event and stop_event.is_set(): + break logger.info(f"Scanning: {folder or 'Root'}") - process_item_list(app, drive_id, folder, os.path.join(local_base, folder), report, executor, futures) + process_item_list(app, drive_id, folder, os.path.join(local_base, folder), report, executor, futures, config, stop_event) logger.info(f"Scan complete. Processing {len(futures)} tasks...") for future in as_completed(futures): + if stop_event and stop_event.is_set(): + break path = futures[future] - success, error = future.result() - if not success: - logger.error(f"FAILED: {path} | {error}") - with report_lock: - report.append({"Path": path, "Error": error, "Timestamp": datetime.now().isoformat()}) + try: + success, error = future.result() + if not success: + logger.error(f"FAILED: {path} | {error}") + with report_lock: + report.append({"Path": path, "Error": error, "Timestamp": datetime.now().isoformat()}) + except InterruptedError: + continue # The executor will shut down anyway + if stop_event and stop_event.is_set(): + logger.warning("Synchronization was stopped by user.") + return + report_file = f"download_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" with open(report_file, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=["Path", "Error", "Timestamp"]) @@ -427,6 +455,8 @@ def main(): logger.info(f"Sync complete. Errors: {len(report)}. Report: {report_file}") + except InterruptedError: + logger.warning("Synchronization was stopped by user.") except Exception as e: logger.critical(f"FATAL ERROR: {e}") diff --git a/sharepoint_gui.py b/sharepoint_gui.py index 6911b5b..2afbfe1 100644 --- a/sharepoint_gui.py +++ b/sharepoint_gui.py @@ -9,16 +9,6 @@ import requests # --- Global Stop Flag --- stop_event = threading.Event() -# For at stoppe uden at ændre download_sharepoint.py, "patcher" vi requests.get -# så den tjekker stop_event før hver anmodning. -original_get = requests.get -def patched_get(*args, **kwargs): - if stop_event.is_set(): - raise InterruptedError("Synkronisering afbrudt af brugeren.") - return original_get(*args, **kwargs) - -requests.get = patched_get - # --- Logging Handler for GUI --- class TextboxHandler(logging.Handler): def __init__(self, textbox): @@ -149,7 +139,8 @@ class SharepointApp(ctk.CTk): def run_sync(self): try: - download_sharepoint.main() + config = download_sharepoint.load_config("connection_info.txt") + download_sharepoint.main(config=config, stop_event=stop_event) if stop_event.is_set(): self.status_label.configure(text="Status: Afbrudt", text_color="red") else: