Files
pdf-slim/pdf-slim

253 lines
9.4 KiB
Python
Executable File

#!/usr/bin/env python3
#
# Copyright (C) 2026 SILO GROUP (www.silogroup.org)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""pdf-slim: losslessly shrink a PDF.
Strategy, in order:
1. mutool clean: garbage-collect, deduplicate, deflate, native font subsetting.
2. For any TrueType font file (FontFile2) still over a size threshold (the
"Word embedded the whole font" case), re-subset it with fontTools against
the document's actual character set (glyph IDs retained so CID/Identity-H
instances sharing the file stay valid), splice it back, recompress.
3. Verify: render every page of the original and the candidate at the same
DPI and require byte-identical pixels. A candidate that fails is discarded.
The smallest verified candidate wins. The input file is never modified.
Usage: pdf-slim input.pdf [output.pdf] [--dpi 200] [--min-font-bytes 150000]
Requires: mutool, qpdf, fix-qdf, pyftsubset (fonttools), python3.
"""
import argparse
import hashlib
import re
import shutil
import subprocess
import sys
import tempfile
import zlib
from pathlib import Path
def run(cmd, **kw):
    """Execute *cmd* and return the finished process with output captured.

    Both stdout and stderr are collected on the returned CompletedProcess
    instead of reaching the terminal.  Extra keyword arguments are passed
    straight through to subprocess.run.
    """
    completed = subprocess.run(cmd, capture_output=True, **kw)
    return completed
def need(tool):
    """Terminate the program unless *tool* can be found on PATH.

    Returns None when the executable is located; otherwise exits through
    sys.exit with a message naming the missing tool.
    """
    located = shutil.which(tool)
    if located is not None:
        return
    sys.exit(f"pdf-slim: required tool '{tool}' not found in PATH")
def render_hashes(pdf, dpi, tmp, tag):
    """Rasterise *pdf* at *dpi* and fingerprint every page image.

    mutool is asked to write the pages into *tmp* as ``<tag>-<n>.ppm``.
    Each image is MD5-hashed and then deleted, so nothing is left behind
    after a successful call.

    Returns the hex digests ordered by page number, or None when mutool
    exits with a non-zero status.
    """
    pattern = tmp / f"{tag}-%d.ppm"
    proc = run(["mutool", "draw", "-r", str(dpi), "-o", str(pattern),
                str(pdf)])
    if proc.returncode != 0:
        return None

    def page_number(image):
        # numeric sort so that page 10 follows page 9, not page 1
        return int(image.stem.split("-")[-1])

    digests = []
    for image in sorted(tmp.glob(f"{tag}-*.ppm"), key=page_number):
        digests.append(hashlib.md5(image.read_bytes()).hexdigest())
        image.unlink()
    return digests
def recompress(src, dst):
    """Rewrite *src* into *dst* with qpdf using its strongest compression.

    qpdf is told to generate object streams, compress streams, and
    recompress existing flate data at level 9.

    Returns True only if qpdf exits with status 0 and *dst* exists.
    """
    command = [
        "qpdf",
        "--object-streams=generate",
        "--compress-streams=y",
        "--recompress-flate",
        "--compression-level=9",
        str(src),
        str(dst),
    ]
    if run(command).returncode != 0:
        return False
    return dst.exists()
def doc_unicodes(pdf, tmp):
    """Collect the codepoints used by *pdf*'s text, plus a safety margin.

    The text is extracted with mutool into ``text.txt`` inside *tmp*;
    characters below U+0020 are ignored.  Printable ASCII and a handful of
    typographic characters are always included, so the result is non-empty
    even when extraction fails.

    Returns a sorted list of integer codepoints.
    """
    dump = tmp / "text.txt"
    proc = run(["mutool", "draw", "-F", "text", "-o", str(dump), str(pdf)])

    # safety padding: printable ASCII + punctuation Word commonly substitutes
    codepoints = set(range(0x20, 0x7F))
    codepoints.update((
        0x2013, 0x2014, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2026,
        0x00A0, 0x00AE, 0x00A9, 0x2122,
    ))

    if proc.returncode == 0 and dump.exists():
        extracted = dump.read_text(encoding="utf-8", errors="replace")
        codepoints.update(ord(ch) for ch in extracted if ord(ch) >= 0x20)
    return sorted(codepoints)
def resubset_fonts(src, dst, tmp, min_bytes, cps):
    """Re-subset oversized embedded TrueType fonts of src and write dst.

    src is expanded to QDF form with qpdf.  Every stream referenced as
    /FontFile2 whose /Length1 is at least min_bytes is re-subset with
    pyftsubset (--retain-gids) against cps; when the result is smaller it
    is spliced back in place of the original font data.  fix-qdf then
    repairs the lengths and the file is recompressed into dst.

    Args:
        src: path of the PDF to process.
        dst: path the recompressed result is written to.
        tmp: scratch directory (pathlib.Path) for intermediate files.
        min_bytes: fonts whose /Length1 is below this are left alone.
        cps: iterable of integer Unicode codepoints to keep.

    Returns:
        True if at least one font was shrunk and dst was produced;
        False otherwise (including any tool failure along the way).
    """
    qdf = tmp / "expand.pdf"
    r = run(["qpdf", "--qdf", "--object-streams=disable", str(src), str(qdf)])
    if r.returncode != 0:
        return False
    data = qdf.read_bytes()

    # objects referenced as /FontFile2 N 0 R
    ff2 = set(re.findall(rb"/FontFile2\s+(\d+)\s+0\s+R", data))
    if not ff2:
        return False

    uni = tmp / "unicodes.txt"
    uni.write_text(",".join(f"U+{c:04X}" for c in cps))

    changed = False
    # Numeric order makes runs reproducible: iteration order of a set of
    # bytes objects changes from run to run with hash randomisation.
    for objnum in sorted(ff2, key=int):
        # `data` is rewritten after each successful splice, so the object
        # is located afresh on every iteration.
        m = re.search(
            rb"(?<!\d)" + objnum +
            rb" 0 obj\s*<<(.*?/Length1\s+(\d+).*?)>>\s*stream\r?\n",
            data, re.S)
        if not m:
            continue
        stream_dict = m.group(1)
        # The lazy, dot-matches-all pattern runs past this object when its
        # own dictionary has no /Length1.  Such a match would make the
        # splice below rewrite other objects' headers, so refuse it.
        if b"endobj" in stream_dict:
            continue
        length1 = int(m.group(2))
        if length1 < min_bytes:
            continue
        is_flate = b"FlateDecode" in stream_dict
        start = m.end()
        # find endstream at/after the declared stream length, never inside
        # the binary font data (which can contain the literal bytes); for
        # a compressed stream the stored size is unknown, so search from
        # the start of the data instead
        if is_flate:
            end = data.find(b"endstream", start)
        else:
            end = data.find(b"endstream", start + length1)
        if end < 0:
            continue
        raw = data[start:end]
        # QDF streams are normally uncompressed; tolerate flate just in case
        # NOTE(review): only the plain "/Filter /FlateDecode" form is
        # stripped further down; array-form filters or /DecodeParms would
        # be left in the header -- confirm qpdf never emits them here.
        if is_flate:
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                continue
        # drop anything past the declared font size (e.g. the line break
        # that precedes 'endstream')
        raw = raw[:length1]
        if raw[:4] not in (b"\x00\x01\x00\x00", b"true", b"ttcf"):
            continue  # not TrueType; out of scope
        name = objnum.decode()
        big = tmp / f"font-{name}.ttf"
        small = tmp / f"font-{name}-sub.ttf"
        big.write_bytes(raw)
        r = run(["pyftsubset", str(big), f"--unicodes-file={uni}",
                 "--retain-gids", "--notdef-outline",
                 f"--output-file={small}"])
        if r.returncode != 0 or not small.exists():
            continue
        sub = small.read_bytes()
        if len(sub) >= length1:
            continue  # no win
        header = data[m.start():m.end()]
        new_header = re.sub(rb"/Length1\s+\d+",
                            b"/Length1 %d" % len(sub), header)
        # stream must be stored uncompressed for fix-qdf to recount /Length;
        # strip any filter entry for this stream
        new_header = re.sub(rb"/Filter\s*/FlateDecode", b"", new_header)
        # the newline before 'endstream' is part of QDF's line conventions
        # that fix-qdf depends on -- it must be restored after the splice
        data = data[:m.start()] + new_header + sub + b"\n" + data[end:]
        changed = True

    if not changed:
        return False

    spliced = tmp / "spliced.pdf"
    spliced.write_bytes(data)
    fixed = tmp / "fixed.pdf"
    with open(fixed, "wb") as fh:
        # The repaired file is taken from fix-qdf's stdout, which is why
        # the capturing run() helper cannot be used here.  stderr is
        # discarded so its diagnostics do not reach the terminal, as with
        # every other external tool call in this file.
        r = subprocess.run(["fix-qdf", str(spliced)], stdout=fh,
                           stderr=subprocess.DEVNULL)
    if r.returncode != 0:
        return False
    return recompress(fixed, dst)
def main():
    """Command-line entry point: build candidates and keep the best one.

    The input is rendered once as the reference.  Up to three smaller
    candidates are produced in a temporary directory (qpdf recompression,
    mutool clean, and manual font re-subsetting), and the smallest one
    whose page renderings hash identically to the input's is copied to the
    output path.  If no verified candidate is more than 1% smaller, the
    output is a plain copy of the input.

    Exits through sys.exit on a missing tool, a missing input file, an
    output path that resolves to the input, or an input that cannot be
    rendered.
    """
    parser = argparse.ArgumentParser(prog="pdf-slim")
    parser.add_argument("input")
    parser.add_argument("output", nargs="?")
    parser.add_argument("--dpi", type=int, default=200,
                        help="verification render resolution (default 200)")
    parser.add_argument("--min-font-bytes", type=int, default=150000,
                        help="re-subset embedded TrueType fonts larger than this")
    args = parser.parse_args()

    required = ("mutool", "qpdf", "fix-qdf", "pyftsubset")
    for tool in required:
        need(tool)

    inp = Path(args.input).expanduser()
    if not inp.is_file():
        sys.exit(f"pdf-slim: no such file: {inp}")

    if args.output:
        out = Path(args.output).expanduser()
    else:
        out = inp.with_name(inp.stem + "-slim.pdf")
    if out.resolve() == inp.resolve():
        sys.exit("pdf-slim: refusing to overwrite the input file")

    orig_size = inp.stat().st_size

    with tempfile.TemporaryDirectory(prefix="pdf-slim-") as workdir:
        tmp = Path(workdir)
        print(f"input: {inp} ({orig_size:,} bytes)")

        baseline = render_hashes(inp, args.dpi, tmp, "orig")
        if not baseline:
            sys.exit("pdf-slim: could not render input for verification")

        candidates = []

        # candidate A: qpdf structural only (always render-safe, small win)
        structural = tmp / "a.pdf"
        if recompress(inp, structural):
            candidates.append(("structural", structural))

        # candidate B: mutool clean with native subsetting
        cleaned_raw = tmp / "b-raw.pdf"
        cleaned = tmp / "b.pdf"
        clean = run(["mutool", "clean", "-gggg", "-z", "-f", "-i", "-c", "-S",
                     "-Z", str(inp), str(cleaned_raw)])
        if (clean.returncode == 0 and cleaned_raw.exists()
                and recompress(cleaned_raw, cleaned)):
            candidates.append(("mutool-subset", cleaned))

        # candidate C: B plus manual re-subset of oversized TrueType fonts
        resubset = tmp / "c.pdf"
        base_for_c = cleaned if cleaned.exists() else inp
        cps = doc_unicodes(inp, tmp)
        if resubset_fonts(base_for_c, resubset, tmp, args.min_font_bytes, cps):
            candidates.append(("font-resubset", resubset))

        # walk the candidates from smallest to largest and stop at the
        # first one that renders identically
        ranked = sorted(candidates,
                        key=lambda entry: entry[1].stat().st_size)
        winner = None
        for label, candidate in ranked:
            if render_hashes(candidate, args.dpi, tmp, "cand") == baseline:
                winner = (label, candidate)
                break
            print(f" rejected {label}: rendering differs")

        # idempotency: only accept a win of more than 1%, otherwise emit
        # the input unchanged so repeated runs are stable
        worthwhile = (winner is not None
                      and winner[1].stat().st_size < orig_size * 0.99)
        if worthwhile:
            shutil.copyfile(winner[1], out)
            new_size = out.stat().st_size
            pct = 100 * (orig_size - new_size) / orig_size
            print(f"method: {winner[0]} (pixel-identical at {args.dpi} dpi)")
            print(f"output: {out} ({new_size:,} bytes, -{pct:.1f}%)")
        else:
            shutil.copyfile(inp, out)
            print("already minimal; output is an unchanged copy of the input")
            print(f"output: {out} ({orig_size:,} bytes, -0.0%)")
# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()