Add pdf-slim
This commit is contained in:
237
pdf-slim
Executable file
237
pdf-slim
Executable file
@@ -0,0 +1,237 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""pdf-slim: losslessly shrink a PDF.
|
||||||
|
|
||||||
|
Strategy, in order:
|
||||||
|
1. mutool clean: garbage-collect, deduplicate, deflate, native font subsetting.
|
||||||
|
2. For any TrueType font file (FontFile2) still over a size threshold (the
|
||||||
|
"Word embedded the whole font" case), re-subset it with fontTools against
|
||||||
|
the document's actual character set (glyph IDs retained so CID/Identity-H
|
||||||
|
instances sharing the file stay valid), splice it back, recompress.
|
||||||
|
3. Verify: render every page of the original and the candidate at the same
|
||||||
|
DPI and require byte-identical pixels. A candidate that fails is discarded.
|
||||||
|
|
||||||
|
The smallest verified candidate wins. The input file is never modified.
|
||||||
|
|
||||||
|
Usage: pdf-slim input.pdf [output.pdf] [--dpi 200] [--min-font-bytes 150000]
|
||||||
|
|
||||||
|
Requires: mutool, qpdf, fix-qdf, pyftsubset (fonttools), python3.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import zlib
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def run(cmd, **kw):
    """Execute *cmd* and return the finished process with its output captured.

    Thin wrapper around ``subprocess.run`` that always passes
    ``capture_output=True``; any further keyword arguments are forwarded
    unchanged.  The exit status is not checked here -- callers inspect
    ``returncode`` themselves.
    """
    completed = subprocess.run(cmd, capture_output=True, **kw)
    return completed
|
||||||
|
|
||||||
|
|
||||||
|
def need(tool):
    """Abort the program unless the executable *tool* can be found on PATH.

    Returns ``None`` when the tool is available; otherwise exits via
    ``sys.exit`` with a message naming the missing tool.
    """
    located = shutil.which(tool)
    if located is not None:
        return
    sys.exit(f"pdf-slim: required tool '{tool}' not found in PATH")
|
||||||
|
|
||||||
|
|
||||||
|
def render_hashes(pdf, dpi, tmp, tag):
    """Render every page of *pdf* to PPM and return the per-page MD5 digests.

    Args:
        pdf: path of the PDF to rasterise.
        dpi: resolution handed to ``mutool draw -r``.
        tmp: scratch directory (a ``Path``) that receives the page images.
        tag: filename prefix; pages are written as ``<tag>-<page>.ppm``.

    Returns:
        A list of hex digests in page order, or ``None`` when ``mutool draw``
        exits with a non-zero status.

    The page images are always removed again, on failure as well as on
    success, and leftovers carrying the same tag are cleared before
    rendering.  Callers reuse a tag across several documents, so a stale
    image from an aborted render must never be hashed as if it belonged to
    the document being rendered now.
    """
    def page_images():
        return list(tmp.glob(f"{tag}-*.ppm"))

    # Start from a clean slate for this tag.
    for stale in page_images():
        stale.unlink()

    out = tmp / f"{tag}-%d.ppm"
    r = run(["mutool", "draw", "-r", str(dpi), "-o", str(out), str(pdf)])

    pages = page_images()
    try:
        if r.returncode != 0:
            return None
        # Sort numerically: a lexical sort would place page 10 before page 2.
        pages.sort(key=lambda p: int(p.stem.rsplit("-", 1)[-1]))
        # MD5 is used purely for change detection, not for security.
        return [hashlib.md5(p.read_bytes()).hexdigest() for p in pages]
    finally:
        # Remove whatever was rendered, including partial output of a
        # failed run.
        for p in pages:
            p.unlink()
|
||||||
|
|
||||||
|
|
||||||
|
def recompress(src, dst):
    """Rewrite *src* into *dst* with qpdf using maximum stream compression.

    qpdf is asked to generate object streams, compress streams and
    recompress existing flate data at level 9.

    Returns ``True`` only when qpdf exits with status 0 and *dst* exists
    afterwards.
    """
    command = [
        "qpdf",
        "--object-streams=generate",
        "--compress-streams=y",
        "--recompress-flate",
        "--compression-level=9",
        str(src),
        str(dst),
    ]
    result = run(command)
    if result.returncode != 0:
        return False
    return dst.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def doc_unicodes(pdf, tmp):
    """Collect the codepoints found in *pdf*'s text, plus a safety margin.

    The text is extracted with ``mutool draw -F text`` into
    ``tmp/text.txt``; characters below U+0020 are ignored.  When the
    extraction fails, only the padding is returned.

    Returns a sorted list of integer codepoints.
    """
    text_path = tmp / "text.txt"
    result = run(["mutool", "draw", "-F", "text", "-o", str(text_path),
                  str(pdf)])

    found = set()
    if result.returncode == 0 and text_path.exists():
        extracted = text_path.read_text(encoding="utf-8", errors="replace")
        found.update(ord(ch) for ch in extracted if ord(ch) >= 0x20)

    # safety padding: printable ASCII + punctuation Word commonly substitutes
    printable_ascii = range(0x20, 0x7F)
    word_substitutes = (
        0x2013, 0x2014, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2026,
        0x00A0, 0x00AE, 0x00A9, 0x2122,
    )
    found.update(printable_ascii)
    found.update(word_substitutes)
    return sorted(found)
|
||||||
|
|
||||||
|
|
||||||
|
def resubset_fonts(src, dst, tmp, min_bytes, cps):
    """Re-subset oversized embedded TrueType fonts of *src* into *dst*.

    *src* is expanded to QDF form with qpdf.  Every stream referenced as
    ``/FontFile2 N 0 R`` whose ``/Length1`` is at least *min_bytes* is run
    through ``pyftsubset --retain-gids`` against the codepoints in *cps*.
    A smaller result is spliced back in place of the original font data,
    after which ``fix-qdf`` repairs the stream lengths and offsets and the
    file is recompressed into *dst*.

    Args:
        src: PDF to read (never modified).
        dst: path the recompressed result is written to.
        tmp: scratch directory (a ``Path``) for intermediate files.
        min_bytes: fonts whose ``/Length1`` is below this are left alone.
        cps: iterable of integer codepoints to keep in the subset.

    Returns:
        ``True`` if at least one font was shrunk and the final recompress
        succeeded, ``False`` otherwise (including any tool failure).
    """
    qdf = tmp / "expand.pdf"
    r = run(["qpdf", "--qdf", "--object-streams=disable", str(src), str(qdf)])
    if r.returncode != 0:
        return False
    data = qdf.read_bytes()

    # objects referenced as /FontFile2 N 0 R; sorted numerically so the
    # processing order does not depend on set/hash ordering
    ff2 = sorted(set(re.findall(rb"/FontFile2\s+(\d+)\s+0\s+R", data)),
                 key=int)
    if not ff2:
        return False

    uni = tmp / "unicodes.txt"
    uni.write_text(",".join(f"U+{c:04X}" for c in cps))

    changed = False
    for objnum in ff2:
        # data is re-searched on every pass because an earlier splice
        # shifts all later offsets
        m = re.search(
            rb"(?<!\d)" + objnum +
            rb" 0 obj\s*<<(.*?/Length1\s+(\d+).*?)>>\s*stream\r?\n",
            data, re.S)
        if not m:
            continue
        stream_dict = m.group(1)
        # The lazy match is not confined to one object: if this object has
        # no /Length1 of its own, the pattern runs on into a later object
        # and would splice the wrong stream.  Crossing an 'endobj' is the
        # tell-tale sign, so such matches are rejected.
        if b"endobj" in stream_dict:
            continue
        length1 = int(m.group(2))
        if length1 < min_bytes:
            continue

        compressed = b"FlateDecode" in stream_dict
        start = m.end()
        # find endstream at/after the declared stream length, never inside
        # the binary font data (which can contain the literal bytes); for a
        # compressed stream the stored length is unknown, so search from
        # the start of the data
        search_from = start if compressed else start + length1
        end = data.find(b"endstream", search_from)
        if end < 0:
            continue
        raw = data[start:end]
        # QDF streams are normally uncompressed; tolerate flate just in case
        if compressed:
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                continue
        # drop the newline that precedes 'endstream'
        raw = raw[:length1]
        if raw[:4] not in (b"\x00\x01\x00\x00", b"true", b"ttcf"):
            continue  # not TrueType; out of scope

        big = tmp / f"font-{objnum.decode()}.ttf"
        small = tmp / f"font-{objnum.decode()}-sub.ttf"
        big.write_bytes(raw)
        r = run(["pyftsubset", str(big), f"--unicodes-file={uni}",
                 "--retain-gids", "--notdef-outline",
                 f"--output-file={small}"])
        if r.returncode != 0 or not small.exists():
            continue
        sub = small.read_bytes()
        if len(sub) >= length1:
            continue  # no win

        header = data[m.start():m.end()]
        new_header = re.sub(rb"/Length1\s+\d+",
                            b"/Length1 %d" % len(sub), header)
        # stream must be stored uncompressed for fix-qdf to recount /Length;
        # strip any filter entry for this stream
        new_header = re.sub(rb"/Filter\s*/FlateDecode", b"", new_header)
        # the newline before 'endstream' is part of QDF's line conventions
        # that fix-qdf depends on -- it must be restored after the splice
        data = data[:m.start()] + new_header + sub + b"\n" + data[end:]
        changed = True

    if not changed:
        return False

    spliced = tmp / "spliced.pdf"
    spliced.write_bytes(data)
    fixed = tmp / "fixed.pdf"
    # fix-qdf writes the repaired file to stdout; going through run() keeps
    # its stderr off the terminal like every other tool invocation
    r = run(["fix-qdf", str(spliced)])
    if r.returncode != 0:
        return False
    fixed.write_bytes(r.stdout)
    return recompress(fixed, dst)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Parses the arguments, checks that the external tools are present and
    builds up to three candidate files in a temporary directory: a
    structural qpdf rewrite, a ``mutool clean`` result, and that result
    with oversized TrueType fonts re-subset.  The smallest candidate whose
    page renders hash identically to the input's is copied to the output
    path.  If no candidate verifies, or the best one is not more than 1%
    smaller, the input itself is copied instead.
    """
    parser = argparse.ArgumentParser(prog="pdf-slim")
    parser.add_argument("input")
    parser.add_argument("output", nargs="?")
    parser.add_argument(
        "--dpi", type=int, default=200,
        help="verification render resolution (default 200)")
    parser.add_argument(
        "--min-font-bytes", type=int, default=150000,
        help="re-subset embedded TrueType fonts larger than this")
    opts = parser.parse_args()

    for required in ("mutool", "qpdf", "fix-qdf", "pyftsubset"):
        need(required)

    source = Path(opts.input).expanduser()
    if not source.is_file():
        sys.exit(f"pdf-slim: no such file: {source}")
    if opts.output:
        target = Path(opts.output).expanduser()
    else:
        target = source.with_name(source.stem + "-slim.pdf")
    if target.resolve() == source.resolve():
        sys.exit("pdf-slim: refusing to overwrite the input file")

    source_bytes = source.stat().st_size

    with tempfile.TemporaryDirectory(prefix="pdf-slim-") as scratch:
        workdir = Path(scratch)
        print(f"input: {source} ({source_bytes:,} bytes)")
        reference = render_hashes(source, opts.dpi, workdir, "orig")
        if not reference:
            sys.exit("pdf-slim: could not render input for verification")

        contenders = []

        # candidate A: qpdf structural only (always render-safe, small win)
        structural = workdir / "a.pdf"
        if recompress(source, structural):
            contenders.append(("structural", structural))

        # candidate B: mutool clean with native subsetting
        cleaned_raw = workdir / "b-raw.pdf"
        cleaned = workdir / "b.pdf"
        clean_cmd = ["mutool", "clean", "-gggg", "-z", "-f", "-i", "-c", "-S",
                     "-Z", str(source), str(cleaned_raw)]
        cleaning = run(clean_cmd)
        if (cleaning.returncode == 0 and cleaned_raw.exists()
                and recompress(cleaned_raw, cleaned)):
            contenders.append(("mutool-subset", cleaned))

        # candidate C: B plus manual re-subset of oversized TrueType fonts
        resubset = workdir / "c.pdf"
        font_base = cleaned if cleaned.exists() else source
        codepoints = doc_unicodes(source, workdir)
        if resubset_fonts(font_base, resubset, workdir,
                          opts.min_font_bytes, codepoints):
            contenders.append(("font-resubset", resubset))

        # try candidates smallest first; the first pixel-identical one wins
        contenders.sort(key=lambda entry: entry[1].stat().st_size)
        winner = None
        for label, pdf_path in contenders:
            rendered = render_hashes(pdf_path, opts.dpi, workdir, "cand")
            if rendered == reference:
                winner = (label, pdf_path)
                break
            print(f" rejected {label}: rendering differs")

        # idempotency: unless a candidate wins meaningfully (>1% smaller),
        # emit the input unchanged so repeated runs are stable
        worthwhile = (winner is not None
                      and winner[1].stat().st_size < source_bytes * 0.99)
        if not worthwhile:
            shutil.copyfile(source, target)
            print("already minimal; output is an unchanged copy of the input")
            print(f"output: {target} ({source_bytes:,} bytes, -0.0%)")
            return

        label, pdf_path = winner
        shutil.copyfile(pdf_path, target)
        result_bytes = target.stat().st_size
        saved_pct = 100 * (source_bytes - result_bytes) / source_bytes
        print(f"method: {label} (pixel-identical at {opts.dpi} dpi)")
        print(f"output: {target} ({result_bytes:,} bytes, -{saved_pct:.1f}%)")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user