diff --git a/download_sharepoint.py b/download_sharepoint.py index 172a235..15245da 100644 --- a/download_sharepoint.py +++ b/download_sharepoint.py @@ -12,6 +12,13 @@ from urllib.parse import urlparse, quote MAX_WORKERS = 5 report_lock = threading.Lock() +def format_size(size_bytes): + """Formats bytes into a human-readable string.""" + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if size_bytes < 1024.0 or unit == 'TB': + return f"{size_bytes:.2f} {unit}" + size_bytes /= 1024.0 + def load_config(file_path): config = {} with open(file_path, 'r', encoding='utf-8') as f: @@ -68,9 +75,11 @@ def download_single_file(download_url, local_path, expected_size, display_name): print(f"Skipped (matches local): {display_name}") return True, None + print(f"Starting: {display_name} ({format_size(expected_size)})") os.makedirs(os.path.dirname(local_path), exist_ok=True) - # Using a timeout for the request to prevent hanging indefinitely - response = requests.get(download_url, stream=True, timeout=60) + + # Using a longer timeout for the initial connection on very large files + response = requests.get(download_url, stream=True, timeout=120) response.raise_for_status() with open(local_path, 'wb') as f: @@ -81,7 +90,7 @@ def download_single_file(download_url, local_path, expected_size, display_name): # Verify size after download local_size = os.path.getsize(local_path) if int(local_size) == int(expected_size): - print(f"Downloaded: {display_name}") + print(f"DONE: {display_name}") return True, None else: return False, f"Size mismatch: Remote={expected_size}, Local={local_size}" @@ -89,36 +98,46 @@ def process_item_list(app, drive_id, item_path, local_root_path, report, executo def process_item_list(app, drive_id, item_path, local_root_path, report, executor, futures): - """Traverses folders and submits file downloads to the executor.""" + """Traverses folders and submits file downloads to the executor with pagination support.""" try: headers = get_headers(app) encoded_path = 
quote(item_path) - url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{encoded_path}:/children" + # Initial URL for the folder children if not item_path: url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root/children" + else: + url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{encoded_path}:/children" - response = requests.get(url, headers=headers) - response.raise_for_status() - items = response.json().get('value', []) - - for item in items: - item_name = item['name'] - local_path = os.path.join(local_root_path, item_name) - display_path = f"{item_path}/{item_name}".strip('/') + while url: + response = requests.get(url, headers=headers, timeout=60) + response.raise_for_status() + data = response.json() + items = data.get('value', []) - if 'folder' in item: - process_item_list(app, drive_id, display_path, local_path, report, executor, futures) - elif 'file' in item: - download_url = item.get('@microsoft.graph.downloadUrl') - if not download_url: - with report_lock: - report.append({"Path": display_path, "Error": "No download URL", "Timestamp": datetime.now().isoformat()}) - continue + for item in items: + item_name = item['name'] + local_path = os.path.join(local_root_path, item_name) + display_path = f"{item_path}/{item_name}".strip('/') - # Submit download to thread pool - future = executor.submit(download_single_file, download_url, local_path, item['size'], display_path) - futures[future] = display_path + if 'folder' in item: + process_item_list(app, drive_id, display_path, local_path, report, executor, futures) + elif 'file' in item: + download_url = item.get('@microsoft.graph.downloadUrl') + if not download_url: + with report_lock: + report.append({"Path": display_path, "Error": "No download URL", "Timestamp": datetime.now().isoformat()}) + continue + + # Submit download to thread pool + future = executor.submit(download_single_file, download_url, local_path, item['size'], display_path) + futures[future] = display_path + + # Check 
for next page of items + url = data.get('@odata.nextLink') + if url: + # Refresh token if needed for the next page request + headers = get_headers(app) except Exception as e: with report_lock: @@ -126,11 +145,11 @@ def process_item_list(app, drive_id, item_path, local_root_path, report, executo def main(): config = load_config('connection_info.txt') - tenant_id = config.get('TENANT_ID') - client_id = config.get('CLIENT_ID') - client_secret = config.get('CLIENT_SECRET') - site_url = config.get('SITE_URL') - drive_name = config.get('DOCUMENT_LIBRARY') + tenant_id = config.get('TENANT_ID', '') + client_id = config.get('CLIENT_ID', '') + client_secret = config.get('CLIENT_SECRET', '') + site_url = config.get('SITE_URL', '') + drive_name = config.get('DOCUMENT_LIBRARY', '') folders_to_download_str = config.get('FOLDERS_TO_DOWNLOAD', '') local_path_base = config.get('LOCAL_PATH', '').replace('\\', os.sep) @@ -151,13 +170,15 @@ def main(): futures = {} for folder in folders_to_download: if folder == "": - print("\nProcessing entire document library (Root)...") + print("\nScanning entire document library (Root)...") else: - print(f"\nProcessing folder: {folder}") + print(f"\nScanning folder: {folder}") local_folder_path = os.path.join(local_path_base, folder) process_item_list(app, drive_id, folder, local_folder_path, report, executor, futures) + print(f"\n--- Scanning complete. Active downloads: {len(futures)} ---\n") + # Wait for all downloads to complete and collect errors for future in as_completed(futures): path = futures[future] @@ -177,7 +198,8 @@ def main(): writer.writeheader() writer.writerows(report) - print(f"\nProcess complete. Errors: {len(report)}") + print(f"\nProcess complete. Errors logged: {len(report)}") + print(f"Report file: {report_file}") if __name__ == "__main__": main()