import json
import os

import requests

JSON_PATH = "catalog-export-580103959.json"  # path to the catalog JSON export for 580103959
OUT_DIR = "nsa_uap_pdfs"

os.makedirs(OUT_DIR, exist_ok=True)

with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)


def iter_objects(obj):
    """Recursively yield every dict nested anywhere in the parsed JSON."""
    if isinstance(obj, dict):
        yield obj
        for v in obj.values():
            yield from iter_objects(v)
    elif isinstance(obj, list):
        for v in obj:
            yield from iter_objects(v)


# Collect every string ending in ".pdf" found under the keys the catalog
# export uses for file references.
pdf_urls = []
for o in iter_objects(data):
    for key in ("file", "objects", "online", "url", "href"):
        if key not in o:
            continue
        val = o[key]
        if isinstance(val, str) and val.lower().endswith(".pdf"):
            pdf_urls.append(val)
        elif isinstance(val, list):
            pdf_urls.extend(
                v for v in val if isinstance(v, str) and v.lower().endswith(".pdf")
            )

# De-duplicate while preserving the order in which URLs were found.
seen = set()
final_urls = []
for u in pdf_urls:
    if u not in seen:
        seen.add(u)
        final_urls.append(u)

print(f"Found {len(final_urls)} PDF URLs")

session = requests.Session()
for url in final_urls:
    # Normalize protocol-relative and site-relative URLs.
    if url.startswith("//"):
        url = "https:" + url
    elif url.startswith("/"):
        url = "https://catalog.archives.gov" + url

    fname = url.split("/")[-1].split("?")[0]
    out_path = os.path.join(OUT_DIR, fname)
    if os.path.exists(out_path):
        print(f"Skipping existing {fname}")
        continue

    print(f"Downloading {url} -> {fname}")
    # Stream the response to disk; the timeout keeps a dead connection
    # from hanging the whole run.
    resp = session.get(url, stream=True, timeout=60)
    resp.raise_for_status()
    with open(out_path, "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
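
# Optional follow-up check (a minimal sketch, not part of the export format
# itself): some servers return an HTML error page under a ".pdf" filename, so
# this verifies that each downloaded file begins with the "%PDF" magic bytes.
# It assumes the same OUT_DIR ("nsa_uap_pdfs") used above.
for fname in sorted(os.listdir(OUT_DIR)):
    path = os.path.join(OUT_DIR, fname)
    with open(path, "rb") as f:
        if f.read(4) != b"%PDF":
            print(f"Not a valid PDF (consider re-downloading): {fname}")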