pdf-slim/pdf-slim

#!/usr/bin/env python3
#
# Copyright (C) 2026 SILO GROUP (www.silogroup.org)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""pdf-slim: losslessly shrink a PDF.

Strategy, in order:
  1. mutool clean: garbage-collect, deduplicate, deflate, native font subsetting.
  2. For any TrueType font file (FontFile2) still over a size threshold (the
     "Word embedded the whole font" case), re-subset it with fontTools against
     the document's actual character set (glyph IDs retained so CID/Identity-H
     instances sharing the file stay valid), splice it back, recompress.
  3. Verify: render every page of the original and the candidate at the same
     DPI and require byte-identical pixels. A candidate that fails is discarded.

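The smallest verified candidate wins, provided it is more than 1% smaller than
the input; otherwise the output is an unchanged copy of the input. The input
file is never modified.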

Usage: pdf-slim input.pdf [output.pdf] [--dpi 200] [--min-font-bytes 150000]

Requires: mutool, qpdf, fix-qdf, pyftsubset (fonttools), python3.
"""

import argparse
import hashlib
import re
import shutil
import subprocess
import sys
import tempfile
import zlib
from pathlib import Path


def run(cmd, **kw):
    """Execute *cmd* as a subprocess, capturing its stdout and stderr.

    Any extra keyword arguments are passed straight through to
    ``subprocess.run``. No exit-status check is requested here, so callers
    inspect ``returncode`` on the returned ``CompletedProcess`` themselves.
    """
    completed = subprocess.run(cmd, capture_output=True, **kw)
    return completed


def need(tool):
    """Terminate the program unless *tool* can be located via PATH lookup.

    Returns None when the tool is found; otherwise exits through
    ``sys.exit`` with a message naming the missing tool.
    """
    located = shutil.which(tool)
    if located is not None:
        return
    sys.exit(f"pdf-slim: required tool '{tool}' not found in PATH")


def render_hashes(pdf, dpi, tmp, tag):
    """Render every page of *pdf* to PPM and return a list of MD5 digests.

    Pages are written by ``mutool draw`` into *tmp* as ``{tag}-<page>.ppm``
    at *dpi*, hashed in numeric page order, and deleted again.

    Returns the list of hex digests (one per page image found), or None
    when mutool exits with a non-zero status.
    """
    pattern = f"{tag}-*.ppm"

    # A previous render under the same tag that failed part-way may have left
    # page images behind; hashing them together with this render's pages
    # would corrupt the comparison, so start from a clean slate.
    for stale in list(tmp.glob(pattern)):
        stale.unlink()

    out = tmp / f"{tag}-%d.ppm"
    r = run(["mutool", "draw", "-r", str(dpi), "-o", str(out), str(pdf)])
    if r.returncode != 0:
        # do not leave partial output around for the next caller either
        for partial in list(tmp.glob(pattern)):
            partial.unlink()
        return None

    hashes = []
    # sort on the numeric page suffix so page 10 does not sort before page 2
    for p in sorted(tmp.glob(pattern),
                    key=lambda p: int(p.stem.split("-")[-1])):
        hashes.append(hashlib.md5(p.read_bytes()).hexdigest())
        p.unlink()
    return hashes


def recompress(src, dst):
    """Rewrite *src* into *dst* with qpdf, regenerating object streams and
    recompressing flate streams at level 9.

    Returns True only when qpdf exits with status 0 and *dst* exists
    afterwards; any other exit status counts as failure.
    """
    qpdf_cmd = [
        "qpdf",
        "--object-streams=generate",
        "--compress-streams=y",
        "--recompress-flate",
        "--compression-level=9",
        str(src),
        str(dst),
    ]
    result = run(qpdf_cmd)
    if result.returncode != 0:
        return False
    return dst.exists()


def doc_unicodes(pdf, tmp):
    """Collect the codepoints used by the document's text, plus padding.

    The text is extracted with ``mutool draw -F text`` into ``text.txt``
    inside *tmp*. If extraction fails, only the padding is returned.
    The result is a sorted list of integer codepoints.
    """
    dump = tmp / "text.txt"
    extraction = run(["mutool", "draw", "-F", "text", "-o", str(dump), str(pdf)])

    # safety padding: printable ASCII + punctuation Word commonly substitutes
    codepoints = set(range(0x20, 0x7F))
    codepoints.update((0x2013, 0x2014, 0x2018, 0x2019, 0x201C, 0x201D,
                       0x2022, 0x2026, 0x00A0, 0x00AE, 0x00A9, 0x2122))

    if extraction.returncode == 0 and dump.exists():
        for ch in dump.read_text(encoding="utf-8", errors="replace"):
            cp = ord(ch)
            # skip control characters below the space
            if cp >= 0x20:
                codepoints.add(cp)

    return sorted(codepoints)


def resubset_fonts(src, dst, tmp, min_bytes, cps):
    """Re-subset oversized embedded TrueType fonts of *src* and write *dst*.

    *src* is expanded to QDF form with qpdf. Every object referenced as
    ``/FontFile2`` whose ``/Length1`` is at least *min_bytes* is extracted,
    subset with pyftsubset (``--retain-gids``) against the codepoints in
    *cps*, and spliced back in place. Stream lengths are then repaired with
    fix-qdf and the result is recompressed into *dst*.

    All intermediate files are created inside *tmp*.

    Returns True only if at least one font was shrunk AND fix-qdf and the
    final recompression both succeeded; False otherwise.
    """
    qdf = tmp / "expand.pdf"
    r = run(["qpdf", "--qdf", "--object-streams=disable", str(src), str(qdf)])
    if r.returncode != 0:
        return False
    data = qdf.read_bytes()

    # objects referenced as /FontFile2 N 0 R
    ff2 = set(re.findall(rb"/FontFile2\s+(\d+)\s+0\s+R", data))
    if not ff2:
        return False

    uni = tmp / "unicodes.txt"
    uni.write_text(",".join(f"U+{c:04X}" for c in cps))

    changed = False
    # numeric order keeps runs reproducible (set order of bytes is randomised)
    for objnum in sorted(ff2, key=int):
        m = re.search(
            rb"(?<!\d)" + objnum +
            rb" 0 obj\s*<<(.*?/Length1\s+(\d+).*?)>>\s*stream\r?\n",
            data, re.S)
        if not m:
            continue
        dict_body = m.group(1)
        # The lazy match spans newlines: if this object's own dictionary has
        # no /Length1, it would run on into a later object's dictionary and
        # we would splice the wrong stream. Refuse such matches.
        if b"endobj" in dict_body:
            continue
        length1 = int(m.group(2))
        if length1 < min_bytes:
            continue
        flate = b"FlateDecode" in dict_body
        start = m.end()
        # find endstream at/after the declared stream length, never inside
        # the binary font data (which can contain the literal bytes); for a
        # compressed stream Length1 is the decoded size, so search from start
        end = data.find(b"endstream", start if flate else start + length1)
        if end < 0:
            continue
        raw = data[start:end]
        # QDF streams are normally uncompressed; tolerate flate just in case
        if flate:
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                continue
        raw = raw[:length1]
        if raw[:4] not in (b"\x00\x01\x00\x00", b"true", b"ttcf"):
            continue  # not TrueType; out of scope

        big = tmp / f"font-{objnum.decode()}.ttf"
        small = tmp / f"font-{objnum.decode()}-sub.ttf"
        big.write_bytes(raw)
        r = run(["pyftsubset", str(big), f"--unicodes-file={uni}",
                 "--retain-gids", "--notdef-outline",
                 f"--output-file={small}"])
        if r.returncode != 0 or not small.exists():
            continue
        sub = small.read_bytes()
        if len(sub) >= length1:
            continue  # no win

        header = data[m.start():m.end()]
        new_header = re.sub(rb"/Length1\s+\d+",
                            b"/Length1 %d" % len(sub), header)
        # stream must be stored uncompressed for fix-qdf to recount /Length;
        # strip any filter entry for this stream
        new_header = re.sub(rb"/Filter\s*/FlateDecode", b"", new_header)
        # the newline before 'endstream' is part of QDF's line conventions
        # that fix-qdf depends on -- it must be restored after the splice
        data = data[:m.start()] + new_header + sub + b"\n" + data[end:]
        changed = True

    if not changed:
        return False

    spliced = tmp / "spliced.pdf"
    spliced.write_bytes(data)
    fixed = tmp / "fixed.pdf"
    with open(fixed, "wb") as fh:
        # fix-qdf writes the repaired file to stdout; capture stderr like
        # every other tool invocation so diagnostics do not leak to the
        # terminal
        r = subprocess.run(["fix-qdf", str(spliced)],
                           stdout=fh, stderr=subprocess.PIPE)
    if r.returncode != 0:
        return False
    return recompress(fixed, dst)


def main():
    """Command-line entry point: build candidates, verify them, write output.

    Parses arguments, checks that the external tools are available, renders
    the input once as the verification baseline, then produces up to three
    candidate files and keeps the smallest one whose page renders hash
    identically to the baseline. If no candidate qualifies, or the winner is
    not more than 1% smaller, the input is copied to the output unchanged.
    """
    parser = argparse.ArgumentParser(prog="pdf-slim")
    parser.add_argument("input")
    parser.add_argument("output", nargs="?")
    parser.add_argument("--dpi", type=int, default=200,
                        help="verification render resolution (default 200)")
    parser.add_argument("--min-font-bytes", type=int, default=150000,
                        help="re-subset embedded TrueType fonts larger than this")
    opts = parser.parse_args()

    for tool in ("mutool", "qpdf", "fix-qdf", "pyftsubset"):
        need(tool)

    inp = Path(opts.input).expanduser()
    if not inp.is_file():
        sys.exit(f"pdf-slim: no such file: {inp}")
    if opts.output:
        out = Path(opts.output).expanduser()
    else:
        out = inp.with_name(inp.stem + "-slim.pdf")
    if out.resolve() == inp.resolve():
        sys.exit("pdf-slim: refusing to overwrite the input file")

    orig_size = inp.stat().st_size

    with tempfile.TemporaryDirectory(prefix="pdf-slim-") as td:
        tmp = Path(td)
        print(f"input:  {inp}  ({orig_size:,} bytes)")
        baseline = render_hashes(inp, opts.dpi, tmp, "orig")
        if not baseline:
            sys.exit("pdf-slim: could not render input for verification")

        candidates = []

        # candidate A: qpdf structural only (always render-safe, small win)
        structural = tmp / "a.pdf"
        if recompress(inp, structural):
            candidates.append(("structural", structural))

        # candidate B: mutool clean with native subsetting
        cleaned_raw = tmp / "b-raw.pdf"
        cleaned = tmp / "b.pdf"
        clean = run(["mutool", "clean", "-gggg", "-z", "-f", "-i", "-c", "-S",
                     "-Z", str(inp), str(cleaned_raw)])
        if (clean.returncode == 0 and cleaned_raw.exists()
                and recompress(cleaned_raw, cleaned)):
            candidates.append(("mutool-subset", cleaned))

        # candidate C: B plus manual re-subset of oversized TrueType fonts
        resubset = tmp / "c.pdf"
        base_for_c = cleaned if cleaned.exists() else inp
        cps = doc_unicodes(inp, tmp)
        if resubset_fonts(base_for_c, resubset, tmp, opts.min_font_bytes, cps):
            candidates.append(("font-resubset", resubset))

        # pick the smallest candidate that renders identically
        candidates.sort(key=lambda entry: entry[1].stat().st_size)
        winner = None
        for label, candidate in candidates:
            if render_hashes(candidate, opts.dpi, tmp, "cand") == baseline:
                winner = (label, candidate)
                break
            print(f"  rejected {label}: rendering differs")

        # idempotency: if no candidate wins meaningfully (>1% smaller),
        # emit the input unchanged so repeated runs are stable
        if winner is None or winner[1].stat().st_size >= orig_size * 0.99:
            shutil.copyfile(inp, out)
            print("already minimal; output is an unchanged copy of the input")
            print(f"output: {out}  ({orig_size:,} bytes, -0.0%)")
            return

        method, chosen = winner
        shutil.copyfile(chosen, out)
        new_size = out.stat().st_size
        pct = 100 * (orig_size - new_size) / orig_size
        print(f"method: {method} (pixel-identical at {opts.dpi} dpi)")
        print(f"output: {out}  ({new_size:,} bytes, -{pct:.1f}%)")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()