mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 20:42:43 +00:00 
			
		
		
		
	 3fe9f8e48d
			
		
	
	
		3fe9f8e48d
		
	
	
	
	
		
			
			A page's /Contents can be an array of streams, and the page's contents are then as if those streams are concatenated. Most of the time, a stream ends with whitespace. But in some cases (e.g. 0000642.pdf from 0000.zip from the pdfa dataset), the first stream ends with an operator (`Q`) and the next stream starts with one (`q`), and the concatenation would form a new, unknown operator (`Qq`). Separate the streams' contents with a space to prevent that. Reduces the number of PDF files we fail to open in the -n 500 case from 11 to 10 (in either case, we then crash on 18 of the PDFs that we do manage to open).
		
			
				
	
	
		
			37 lines
		
	
	
	
		
			1.3 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			37 lines
		
	
	
	
		
			1.3 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org>
 | |
|  *
 | |
|  * SPDX-License-Identifier: BSD-2-Clause
 | |
|  */
 | |
| 
 | |
| #include <LibPDF/Document.h>
 | |
| #include <LibPDF/ObjectDerivatives.h>
 | |
| #include <LibPDF/Page.h>
 | |
| 
 | |
| namespace PDF {
 | |
| 
 | |
| PDFErrorOr<ByteBuffer> Page::page_contents(Document& document) const
 | |
| {
 | |
|     // Table 3.27 Entries in a page object on Contents:
 | |
|     // "If this entry is absent, the page is empty. [...]"
 | |
|     if (contents.is_null())
 | |
|         return ByteBuffer {};
 | |
| 
 | |
|     // "The value may be either a single stream or an array of streams. If the value
 | |
|     //  is an array, the effect is as if all the streams in the array were concatenated,
 | |
|     //  in order, to form a single stream. The division between streams may occur only at
 | |
|     //  the boundaries between lexical tokens"
 | |
|     if (contents->is<StreamObject>())
 | |
|         return TRY(ByteBuffer::copy(contents->cast<StreamObject>()->bytes()));
 | |
| 
 | |
|     // If one stream ends with (say) a `Q` and the next starts with `q`, that should be
 | |
|     // two distinct tokens. Insert spaces between stream contents to ensure that.
 | |
|     ByteBuffer byte_buffer;
 | |
|     for (auto& ref : *contents->cast<ArrayObject>()) {
 | |
|         TRY(byte_buffer.try_append(TRY(document.resolve_to<StreamObject>(ref))->bytes()));
 | |
|         TRY(byte_buffer.try_append(' '));
 | |
|     }
 | |
|     return byte_buffer;
 | |
| }
 | |
| 
 | |
| }
 |