diff --git a/Meta/test_pdf.py b/Meta/test_pdf.py new file mode 100755 index 0000000000..1749b84b18 --- /dev/null +++ b/Meta/test_pdf.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +'''Runs `pdf --debugging-stats` on a bunch of PDF files in parallel. + +Give it one or more folders containing PDF files, and the optional -n flag +to pick a random subset of n PDFs: + + test_pdf.py -n 200 ~/Downloads/0000 ~/src/pdffiles + +https://pdfa.org/new-large-scale-pdf-corpus-now-publicly-available/ has +8 TB of test PDFs, organized in a bunch of zip files with 1000 PDFs each. +One of those zip files in unzipped makes for a good input folder. +''' + +import argparse +import collections +import glob +import multiprocessing +import os +import random +import re +import subprocess + + +Result = collections.namedtuple( + 'Result', ['filename', 'returncode', 'stdout', 'stderr']) + + +def elide_aslr(s): + return re.sub(rb'\b0x[0-9a-f]+\b', b'0xc3ns0r3d', s) + + +def test_pdf(filename): + pdf_path = os.path.join(os.path.dirname(__file__), '../Build/lagom/bin/pdf') + r = subprocess.run([pdf_path, '--debugging-stats', filename], + capture_output=True) + return Result(filename, r.returncode, r.stdout, elide_aslr(r.stderr)) + + +def main(): + parser = argparse.ArgumentParser( + epilog=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument('input', nargs='+', help='input directories') + parser.add_argument('-n', type=int, help='render at most n pdfs') + args = parser.parse_args() + + files = [] + for input_directory in args.input: + files += glob.glob(os.path.join(input_directory, '*.pdf')) + if args.n is not None: + random.seed(42) + files = random.sample(files, k=args.n) + + results = multiprocessing.Pool().map(test_pdf, files) + + num_crashes = 0 + stack_to_files = {} + for r in results: + print(r.filename) + print(r.stdout.decode('utf-8')) + if r.returncode != 0: + num_crashes += 1 + stack_to_files.setdefault(r.stderr, []).append(r.filename) + + print('Top 5 crashiest stacks') + keys = list(stack_to_files.keys()) + keys.sort(key=lambda x: len(stack_to_files[x]), reverse=True) + for stack in reversed(keys[:5]): + files = stack_to_files[stack] + print(stack.decode('utf-8'), end='') + print(f'In {len(files)} files:') + for file in files: + print(f' {file}') + print() + + print(f'{len(keys)} distinct crash stacks') + + percent = 100 * num_crashes / len(results) + print(f'{num_crashes} crashes ({percent:.1f}%)') + + +if __name__ == '__main__': + main()