Improve cancellation logic and sync performance.

- Implement explicit threading.Event propagation for robust GUI cancellation.
- Optimize file synchronization by skipping hash validation for up-to-date files (matching size and timestamp).
- Update Windows long path support to correctly handle UNC network shares.
- Refactor configuration management to eliminate global state and improve modularity.
- Remove requests.get monkey-patch in GUI.
- Delete CLAUDE.md as it is no longer required.
This commit is contained in:
Martin Tranberg
2026-04-12 12:44:43 +02:00
parent 8899afabbc
commit 8e8bb3baa1
3 changed files with 72 additions and 101 deletions

View File

@@ -22,10 +22,6 @@ CHUNK_SIZE = 1024 * 1024 # 1MB Chunks
MAX_FOLDER_DEPTH = 50
LOG_FILE = "sharepoint_download.log"
# Hash Validation Settings
ENABLE_HASH_VALIDATION = True # Set to False to only check file size
HASH_THRESHOLD_MB = 30 # Only hash files smaller than this (in MB)
# Setup Logging
logging.basicConfig(
level=logging.INFO,
@@ -46,9 +42,12 @@ def format_size(size_bytes):
return f"{size_bytes:.2f} EB"
def get_long_path(path):
    r"""Handle the Windows MAX_PATH limitation by prefixing absolute paths.

    On Windows (os.name == 'nt') returns an extended-length path:
      - local drive path:  C:\dir\file      -> \\?\C:\dir\file
      - UNC network share: \\server\share\f -> \\?\UNC\server\share\f
    Paths that already carry the \\?\ prefix, and all paths on
    non-Windows systems, are returned unchanged apart from the
    normalisation done by os.path.abspath().
    """
    path = os.path.abspath(path)
    if os.name == 'nt' and not path.startswith("\\\\?\\"):
        if path.startswith("\\\\"):
            # UNC share: replace the leading "\\" with the "\\?\UNC\" prefix.
            return "\\\\?\\UNC\\" + path[2:]
        return "\\\\?\\" + path
    return path
@@ -61,6 +60,21 @@ def load_config(file_path):
if '=' in line:
key, value = line.split('=', 1)
config[key.strip()] = value.strip().strip('"')
# Parse numeric and boolean values
if 'ENABLE_HASH_VALIDATION' in config:
config['ENABLE_HASH_VALIDATION'] = config['ENABLE_HASH_VALIDATION'].lower() == 'true'
else:
config['ENABLE_HASH_VALIDATION'] = True
if 'HASH_THRESHOLD_MB' in config:
try:
config['HASH_THRESHOLD_MB'] = int(config['HASH_THRESHOLD_MB'])
except ValueError:
config['HASH_THRESHOLD_MB'] = 30
else:
config['HASH_THRESHOLD_MB'] = 30
return config
# --- Punkt 1: Exponential Backoff & Retry Logic ---
@@ -140,16 +154,17 @@ def quickxorhash(file_path):
result = h.to_bytes(20, byteorder='little')
return base64.b64encode(result).decode('ascii')
def verify_integrity(local_path, remote_hash):
"""Verifies file integrity based on global settings."""
if not remote_hash or not ENABLE_HASH_VALIDATION:
def verify_integrity(local_path, remote_hash, config):
"""Verifies file integrity based on config settings."""
if not remote_hash or not config.get('ENABLE_HASH_VALIDATION', True):
return True
file_size = os.path.getsize(get_long_path(local_path))
threshold_bytes = HASH_THRESHOLD_MB * 1024 * 1024
threshold_mb = config.get('HASH_THRESHOLD_MB', 30)
threshold_bytes = threshold_mb * 1024 * 1024
if file_size > threshold_bytes:
logger.info(f"Skipping hash check (size > {HASH_THRESHOLD_MB}MB): {os.path.basename(local_path)}")
logger.info(f"Skipping hash check (size > {threshold_mb}MB): {os.path.basename(local_path)}")
return True
local_hash = quickxorhash(local_path)
@@ -241,8 +256,11 @@ def get_fresh_download_url(app, drive_id, item_id):
return None, "Item returned but '@microsoft.graph.downloadUrl' was missing after 3 attempts."
def download_single_file(app, drive_id, item_id, local_path, expected_size, display_name, remote_hash=None, initial_url=None, remote_mtime_str=None):
def download_single_file(app, drive_id, item_id, local_path, expected_size, display_name, config, stop_event=None, remote_hash=None, initial_url=None, remote_mtime_str=None):
try:
if stop_event and stop_event.is_set():
raise InterruptedError("Sync cancelled")
file_mode = 'wb'
resume_header = {}
existing_size = 0
@@ -260,12 +278,8 @@ def download_single_file(app, drive_id, item_id, local_path, expected_size, disp
# Hvis filen findes, har rigtig størrelse OG lokal er ikke ældre end remote -> SKIP
if existing_size == expected_size:
if local_mtime >= (remote_mtime - 1): # Vi tillader 1 sekuds difference pga. filsystem-præcision
if verify_integrity(local_path, remote_hash):
logger.info(f"Skipped (up-to-date): {display_name}")
return True, None
else:
logger.warning(f"Hash mismatch on existing file: {display_name}. Re-downloading.")
existing_size = 0
logger.info(f"Skipped (up-to-date): {display_name}")
return True, None
else:
logger.info(f"Update available: {display_name} (Remote is newer)")
existing_size = 0
@@ -306,13 +320,15 @@ def download_single_file(app, drive_id, item_id, local_path, expected_size, disp
with open(long_local_path, file_mode) as f:
for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
if stop_event and stop_event.is_set():
raise InterruptedError("Sync cancelled")
if chunk:
f.write(chunk)
# Post-download check
final_size = os.path.getsize(long_local_path)
if final_size == expected_size:
if verify_integrity(local_path, remote_hash):
if verify_integrity(local_path, remote_hash, config):
logger.info(f"DONE: {display_name}")
return True, None
else:
@@ -320,15 +336,20 @@ def download_single_file(app, drive_id, item_id, local_path, expected_size, disp
else:
return False, f"Size mismatch: Remote={expected_size}, Local={final_size}"
except InterruptedError:
raise
except Exception as e:
return False, str(e)
# --- Main Traversal Logic ---
def process_item_list(app, drive_id, item_path, local_root_path, report, executor, futures, depth=0):
def process_item_list(app, drive_id, item_path, local_root_path, report, executor, futures, config, stop_event=None, depth=0):
if depth >= MAX_FOLDER_DEPTH:
logger.warning(f"Max folder depth ({MAX_FOLDER_DEPTH}) reached at: {item_path}. Skipping subtree.")
return
try:
if stop_event and stop_event.is_set():
raise InterruptedError("Sync cancelled")
encoded_path = quote(item_path)
if not item_path:
@@ -342,12 +363,15 @@ def process_item_list(app, drive_id, item_path, local_root_path, report, executo
items = data.get('value', [])
for item in items:
if stop_event and stop_event.is_set():
raise InterruptedError("Sync cancelled")
item_name = item['name']
local_path = os.path.join(local_root_path, item_name)
display_path = f"{item_path}/{item_name}".strip('/')
if 'folder' in item:
process_item_list(app, drive_id, display_path, local_path, report, executor, futures, depth + 1)
process_item_list(app, drive_id, display_path, local_path, report, executor, futures, config, stop_event, depth + 1)
elif 'file' in item:
item_id = item['id']
download_url = item.get('@microsoft.graph.downloadUrl')
@@ -358,12 +382,14 @@ def process_item_list(app, drive_id, item_path, local_root_path, report, executo
download_single_file,
app, drive_id, item_id,
local_path, item['size'], display_path,
remote_hash, download_url, remote_mtime
config, stop_event, remote_hash, download_url, remote_mtime
)
futures[future] = display_path
url = data.get('@odata.nextLink')
except InterruptedError:
raise
except Exception as e:
logger.error(f"Error traversing {item_path}: {e}")
with report_lock:
@@ -374,9 +400,11 @@ def create_msal_app(tenant_id, client_id, client_secret):
client_id, authority=f"https://login.microsoftonline.com/{tenant_id}", client_credential=client_secret
)
def main():
def main(config=None, stop_event=None):
try:
config = load_config('connection_info.txt')
if config is None:
config = load_config('connection_info.txt')
tenant_id = config.get('TENANT_ID', '')
client_id = config.get('CLIENT_ID', '')
client_secret = config.get('CLIENT_SECRET', '')
@@ -385,17 +413,6 @@ def main():
folders_str = config.get('FOLDERS_TO_DOWNLOAD', '')
local_base = config.get('LOCAL_PATH', '').replace('\\', os.sep)
# Opdater globale indstillinger fra config hvis de findes
global ENABLE_HASH_VALIDATION, HASH_THRESHOLD_MB
if 'ENABLE_HASH_VALIDATION' in config:
# Vi tjekker om strengen er 'true' (case-insensitive)
ENABLE_HASH_VALIDATION = config['ENABLE_HASH_VALIDATION'].lower() == 'true'
if 'HASH_THRESHOLD_MB' in config:
try:
HASH_THRESHOLD_MB = int(config['HASH_THRESHOLD_MB'])
except ValueError:
logger.warning(f"Ugyldig værdi for HASH_THRESHOLD_MB i config: {config['HASH_THRESHOLD_MB']}. Bruger standard: {HASH_THRESHOLD_MB}")
folders = [f.strip() for f in folders_str.split(',') if f.strip()] or [""]
logger.info("Initializing SharePoint Production Sync Tool...")
@@ -407,18 +424,29 @@ def main():
with ThreadPoolExecutor(max_workers=MAX_WORKERS, thread_name_prefix="DL") as executor:
futures = {}
for folder in folders:
if stop_event and stop_event.is_set():
break
logger.info(f"Scanning: {folder or 'Root'}")
process_item_list(app, drive_id, folder, os.path.join(local_base, folder), report, executor, futures)
process_item_list(app, drive_id, folder, os.path.join(local_base, folder), report, executor, futures, config, stop_event)
logger.info(f"Scan complete. Processing {len(futures)} tasks...")
for future in as_completed(futures):
if stop_event and stop_event.is_set():
break
path = futures[future]
success, error = future.result()
if not success:
logger.error(f"FAILED: {path} | {error}")
with report_lock:
report.append({"Path": path, "Error": error, "Timestamp": datetime.now().isoformat()})
try:
success, error = future.result()
if not success:
logger.error(f"FAILED: {path} | {error}")
with report_lock:
report.append({"Path": path, "Error": error, "Timestamp": datetime.now().isoformat()})
except InterruptedError:
continue # The executor will shut down anyway
if stop_event and stop_event.is_set():
logger.warning("Synchronization was stopped by user.")
return
report_file = f"download_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
with open(report_file, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=["Path", "Error", "Timestamp"])
@@ -427,6 +455,8 @@ def main():
logger.info(f"Sync complete. Errors: {len(report)}. Report: {report_file}")
except InterruptedError:
logger.warning("Synchronization was stopped by user.")
except Exception as e:
logger.critical(f"FATAL ERROR: {e}")