#!/usr/bin/env python3
# Provenance: recovered from git patch f01b05c1c524dd5906195f98808d7bd5bcb39929,
# "[PATCH] Add pdf-slim", Christopher M. Punches, Mon, 8 Jun 2026 10:55:05 -0400
# (new file `pdf-slim`, mode 100755).
"""pdf-slim: losslessly shrink a PDF.

Strategy, in order:
  1. mutool clean: garbage-collect, deduplicate, deflate, native font subsetting.
  2. For any TrueType font file (FontFile2) still over a size threshold (the
     "Word embedded the whole font" case), re-subset it with fontTools against
     the document's actual character set (glyph IDs retained so CID/Identity-H
     instances sharing the file stay valid), splice it back, recompress.
  3. Verify: render every page of the original and the candidate at the same
     DPI and require byte-identical pixels. A candidate that fails is discarded.

The smallest verified candidate wins. The input file is never modified.

Usage: pdf-slim input.pdf [output.pdf] [--dpi 200] [--min-font-bytes 150000]

Requires: mutool, qpdf, fix-qdf, pyftsubset (fonttools), python3.
"""

import argparse
import hashlib
import re
import shutil
import subprocess
import sys
import tempfile
import zlib
from pathlib import Path

# Leading four bytes accepted as "this stream is a TrueType font file".
TRUETYPE_MAGICS = (b"\x00\x01\x00\x00", b"true", b"ttcf")


def run(cmd, **kw):
    """Run *cmd* with stdout/stderr captured and return the CompletedProcess."""
    return subprocess.run(cmd, capture_output=True, **kw)


def need(tool):
    """Exit with an error message unless *tool* is found on PATH."""
    if shutil.which(tool) is None:
        sys.exit(f"pdf-slim: required tool '{tool}' not found in PATH")


def render_hashes(pdf, dpi, tmp, tag):
    """Render every page of *pdf* to PPM and return the list of md5 digests.

    Pages are written to ``tmp/<tag>-<n>.ppm`` by ``mutool draw`` at *dpi*,
    hashed in page order and then deleted.  Returns None when mutool exits
    with a non-zero status.
    """
    pattern = f"{tag}-*.ppm"
    # A previous failed render under the same tag may have left pages behind;
    # remove them so they cannot be hashed as part of this document.
    for stale in tmp.glob(pattern):
        stale.unlink()

    out = tmp / f"{tag}-%d.ppm"
    r = run(["mutool", "draw", "-r", str(dpi), "-o", str(out), str(pdf)])
    pages = sorted(tmp.glob(pattern),
                   key=lambda p: int(p.stem.split("-")[-1]))
    try:
        if r.returncode != 0:
            return None
        return [hashlib.md5(p.read_bytes()).hexdigest() for p in pages]
    finally:
        # Always clean up, including after a partial (failed) render.
        for p in pages:
            p.unlink()


def recompress(src, dst):
    """Rewrite *src* to *dst* with qpdf object streams and level-9 flate.

    Returns True when qpdf exits 0 and *dst* exists.
    """
    r = run(["qpdf", "--object-streams=generate", "--compress-streams=y",
             "--recompress-flate", "--compression-level=9", str(src), str(dst)])
    return r.returncode == 0 and dst.exists()


def doc_unicodes(pdf, tmp):
    """Extract the document's text and return a padded, sorted codepoint list.

    If text extraction fails the result is just the safety padding.
    """
    txt = tmp / "text.txt"
    r = run(["mutool", "draw", "-F", "text", "-o", str(txt), str(pdf)])
    cps = set()
    if r.returncode == 0 and txt.exists():
        text = txt.read_text(encoding="utf-8", errors="replace")
        cps = {ord(c) for c in text if ord(c) >= 0x20}
    # safety padding: printable ASCII + punctuation Word commonly substitutes
    cps |= set(range(0x20, 0x7F))
    cps |= {0x2013, 0x2014, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2026,
            0x00A0, 0x00AE, 0x00A9, 0x2122}
    return sorted(cps)


def shrink_font_streams(data, min_bytes, subset):
    """Replace oversized TrueType FontFile2 streams in QDF bytes *data*.

    *subset* is called as ``subset(label, raw)`` where *label* is the object
    number as text and *raw* is the font file bytes; it returns the subsetted
    font bytes, or None when subsetting failed.  A stream is replaced only
    when its /Length1 is at least *min_bytes*, it starts with a TrueType
    signature, and the subset is strictly smaller than /Length1.

    Returns ``(new_data, changed)``.  The stream's /Length is NOT updated
    here; the caller is expected to run fix-qdf on the result.
    """
    refs = set(re.findall(rb"/FontFile2\s+(\d+)\s+0\s+R", data))
    changed = False
    # Numeric order keeps runs reproducible (set order of bytes is not).
    for objnum in sorted(refs, key=int):
        # group(1): the stream dictionary body, group(2): the /Length1 value.
        m = re.search(
            rb"(?<!\d)" + objnum
            + rb"\s+0\s+obj\s*<<(.*?/Length1\s+(\d+).*?)>>\s*stream\r?\n",
            data, re.S)
        if not m:
            continue
        fontdict = m.group(1)
        if b"endobj" in fontdict:
            # The lazy match ran past this object's end, so the /Length1 it
            # found belongs to some other object.
            continue
        length1 = int(m.group(2))
        if length1 < min_bytes:
            continue
        start = m.end()
        flate = b"FlateDecode" in fontdict
        # find endstream at/after the declared stream length, never inside
        # the binary font data (which can contain the literal bytes)
        end = data.find(b"endstream", start if flate else start + length1)
        if end < 0:
            continue
        raw = data[start:end]
        # QDF streams are normally uncompressed; tolerate flate just in case
        if flate:
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                continue
        raw = raw[:length1]
        if raw[:4] not in TRUETYPE_MAGICS:
            continue  # not TrueType; out of scope

        sub = subset(objnum.decode(), raw)
        if sub is None or len(sub) >= length1:
            continue  # subsetting failed, or no win

        header = data[m.start():m.end()]
        new_header = re.sub(rb"/Length1\s+\d+",
                            b"/Length1 %d" % len(sub), header)
        # stream must be stored uncompressed for fix-qdf to recount /Length;
        # strip any filter entry for this stream
        new_header = re.sub(rb"/Filter\s*/FlateDecode", b"", new_header)
        # the newline before 'endstream' is part of QDF's line conventions
        # that fix-qdf depends on -- it must be restored after the splice
        data = data[:m.start()] + new_header + sub + b"\n" + data[end:]
        changed = True
    return data, changed


def resubset_fonts(src, dst, tmp, min_bytes, cps):
    """Find oversized FontFile2 streams in a QDF expansion of src, re-subset
    them with pyftsubset (--retain-gids), splice back, fix lengths, recompress
    into dst.

    *cps* is the iterable of codepoints to keep.  Returns False when the QDF
    expansion fails, no font was shrunk, or fix-qdf fails; otherwise returns
    the result of recompressing into *dst*.
    """
    qdf = tmp / "expand.pdf"
    r = run(["qpdf", "--qdf", "--object-streams=disable", str(src), str(qdf)])
    if r.returncode != 0:
        return False
    data = qdf.read_bytes()

    # objects referenced as /FontFile2 N 0 R
    if not re.search(rb"/FontFile2\s+\d+\s+0\s+R", data):
        return False

    uni = tmp / "unicodes.txt"
    uni.write_text(",".join(f"U+{c:04X}" for c in cps))

    def subset(label, raw):
        """Subset one font file with pyftsubset; None on failure."""
        big = tmp / f"font-{label}.ttf"
        small = tmp / f"font-{label}-sub.ttf"
        big.write_bytes(raw)
        res = run(["pyftsubset", str(big), f"--unicodes-file={uni}",
                   "--retain-gids", "--notdef-outline",
                   f"--output-file={small}"])
        if res.returncode != 0 or not small.exists():
            return None
        return small.read_bytes()

    data, changed = shrink_font_streams(data, min_bytes, subset)
    if not changed:
        return False

    spliced = tmp / "spliced.pdf"
    spliced.write_bytes(data)
    fixed = tmp / "fixed.pdf"
    with open(fixed, "wb") as fh:
        r = subprocess.run(["fix-qdf", str(spliced)], stdout=fh)
    if r.returncode != 0:
        return False
    return recompress(fixed, dst)


def main():
    """Build the candidates, verify them by rendering, write the best one."""
    ap = argparse.ArgumentParser(prog="pdf-slim")
    ap.add_argument("input")
    ap.add_argument("output", nargs="?")
    ap.add_argument("--dpi", type=int, default=200,
                    help="verification render resolution (default 200)")
    ap.add_argument("--min-font-bytes", type=int, default=150000,
                    help="re-subset embedded TrueType fonts larger than this")
    args = ap.parse_args()

    for tool in ("mutool", "qpdf", "fix-qdf", "pyftsubset"):
        need(tool)

    inp = Path(args.input).expanduser()
    if not inp.is_file():
        sys.exit(f"pdf-slim: no such file: {inp}")
    out = Path(args.output).expanduser() if args.output \
        else inp.with_name(inp.stem + "-slim.pdf")
    if out.resolve() == inp.resolve():
        sys.exit("pdf-slim: refusing to overwrite the input file")

    orig_size = inp.stat().st_size

    with tempfile.TemporaryDirectory(prefix="pdf-slim-") as td:
        tmp = Path(td)
        print(f"input:  {inp} ({orig_size:,} bytes)")
        baseline = render_hashes(inp, args.dpi, tmp, "orig")
        if not baseline:
            sys.exit("pdf-slim: could not render input for verification")

        candidates = []

        # candidate A: qpdf structural only (always render-safe, small win)
        a = tmp / "a.pdf"
        if recompress(inp, a):
            candidates.append(("structural", a))

        # candidate B: mutool clean with native subsetting
        b_raw = tmp / "b-raw.pdf"
        b = tmp / "b.pdf"
        r = run(["mutool", "clean", "-gggg", "-z", "-f", "-i", "-c", "-S",
                 "-Z", str(inp), str(b_raw)])
        if r.returncode == 0 and b_raw.exists() and recompress(b_raw, b):
            candidates.append(("mutool-subset", b))

        # candidate C: B plus manual re-subset of oversized TrueType fonts
        c = tmp / "c.pdf"
        base_for_c = b if b.exists() else inp
        cps = doc_unicodes(inp, tmp)
        if resubset_fonts(base_for_c, c, tmp, args.min_font_bytes, cps):
            candidates.append(("font-resubset", c))

        # pick the smallest candidate that renders identically
        best = None
        for name, path in sorted(candidates,
                                 key=lambda t: t[1].stat().st_size):
            hashes = render_hashes(path, args.dpi, tmp, "cand")
            if hashes == baseline:
                best = (name, path)
                break
            # render_hashes returns None when mutool itself failed
            reason = ("could not be rendered" if hashes is None
                      else "rendering differs")
            print(f"  rejected {name}: {reason}")

        # idempotency: if no candidate wins meaningfully (>1% smaller),
        # emit the input unchanged so repeated runs are stable
        if best is None or best[1].stat().st_size >= orig_size * 0.99:
            shutil.copyfile(inp, out)
            print("already minimal; output is an unchanged copy of the input")
            print(f"output: {out} ({orig_size:,} bytes, -0.0%)")
            return

        shutil.copyfile(best[1], out)
        new_size = out.stat().st_size
        pct = 100 * (orig_size - new_size) / orig_size
        print(f"method: {best[0]} (pixel-identical at {args.dpi} dpi)")
        print(f"output: {out} ({new_size:,} bytes, -{pct:.1f}%)")


if __name__ == "__main__":
    main()