mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-30 05:42:37 +00:00 
			
		
		
		
	LibJS: Combine UTF-16 surrogate pairs when concatenating strings
In the following use case:
    "\ud834" + "\udf06"
We were previously combining these as two individual code points. When
concatenating strings, we must take care to combine the high surrogate
from the left-hand side with the low surrogate from the right-hand side.
			
			
This commit is contained in:
		
							parent
							
								
									ab02e3981e
								
							
						
					
					
						commit
						a57e2f9a76
					
				
					 2 changed files with 78 additions and 21 deletions
				
			
		|  | @ -978,6 +978,51 @@ ThrowCompletionOr<Value> unsigned_right_shift(GlobalObject& global_object, Value | |||
|     return vm.throw_completion<TypeError>(global_object, ErrorType::BigIntBadOperator, "unsigned right-shift"); | ||||
| } | ||||
| 
 | ||||
| // https://tc39.es/ecma262/#string-concatenation
 | ||||
| static PrimitiveString* concatenate_strings(GlobalObject& global_object, PrimitiveString const& lhs, PrimitiveString const& rhs) | ||||
| { | ||||
|     auto& vm = global_object.vm(); | ||||
| 
 | ||||
|     if (lhs.has_utf16_string() && rhs.has_utf16_string()) { | ||||
|         auto const& lhs_string = lhs.utf16_string(); | ||||
|         auto const& rhs_string = rhs.utf16_string(); | ||||
| 
 | ||||
|         Vector<u16, 1> combined; | ||||
|         combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units()); | ||||
|         combined.extend(lhs_string.string()); | ||||
|         combined.extend(rhs_string.string()); | ||||
| 
 | ||||
|         return js_string(vm, Utf16String(move(combined))); | ||||
|     } | ||||
| 
 | ||||
|     Utf8View lhs_string { lhs.string() }; | ||||
|     Utf8View rhs_string { rhs.string() }; | ||||
| 
 | ||||
|     StringBuilder builder(lhs_string.length() + rhs_string.length()); | ||||
|     Optional<u16> high_surrogate; | ||||
| 
 | ||||
|     for (auto it = lhs_string.begin(); it != lhs_string.end(); ++it) { | ||||
|         if (!it.peek(1).has_value() && Utf16View::is_high_surrogate(*it) && !rhs_string.is_empty()) | ||||
|             high_surrogate = *it; | ||||
|         else | ||||
|             builder.append_code_point(*it); | ||||
|     } | ||||
| 
 | ||||
|     if (high_surrogate.has_value()) { | ||||
|         auto low_surrogate = *rhs_string.begin(); | ||||
| 
 | ||||
|         if (Utf16View::is_low_surrogate(low_surrogate)) { | ||||
|             builder.append_code_point(Utf16View::decode_surrogate_pair(*high_surrogate, low_surrogate)); | ||||
|             rhs_string = rhs_string.substring_view(3); // A low surrogate encoded as UTF-8 is 3 bytes.
 | ||||
|         } else { | ||||
|             builder.append_code_point(*high_surrogate); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     builder.append(rhs_string.as_string()); | ||||
|     return js_string(vm, builder.to_string()); | ||||
| } | ||||
| 
 | ||||
| // 13.8.1 The Addition Operator ( + ), https://tc39.es/ecma262/#sec-addition-operator-plus
 | ||||
| ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs) | ||||
| { | ||||
|  | @ -995,28 +1040,10 @@ ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs) | |||
|     auto lhs_primitive = TRY(lhs.to_primitive(global_object)); | ||||
|     auto rhs_primitive = TRY(rhs.to_primitive(global_object)); | ||||
| 
 | ||||
|     if (lhs_primitive.is_string() && rhs_primitive.is_string()) { | ||||
|         auto const& lhs_string = lhs_primitive.as_string(); | ||||
|         auto const& rhs_string = rhs_primitive.as_string(); | ||||
| 
 | ||||
|         if (lhs_string.has_utf16_string() && rhs_string.has_utf16_string()) { | ||||
|             auto const& lhs_utf16_string = lhs_string.utf16_string(); | ||||
|             auto const& rhs_utf16_string = rhs_string.utf16_string(); | ||||
| 
 | ||||
|             Vector<u16, 1> combined; | ||||
|             combined.ensure_capacity(lhs_utf16_string.length_in_code_units() + rhs_utf16_string.length_in_code_units()); | ||||
|             combined.extend(lhs_utf16_string.string()); | ||||
|             combined.extend(rhs_utf16_string.string()); | ||||
|             return Value(js_string(vm.heap(), Utf16String(move(combined)))); | ||||
|         } | ||||
|     } | ||||
|     if (lhs_primitive.is_string() || rhs_primitive.is_string()) { | ||||
|         auto lhs_string = TRY(lhs_primitive.to_string(global_object)); | ||||
|         auto rhs_string = TRY(rhs_primitive.to_string(global_object)); | ||||
|         StringBuilder builder(lhs_string.length() + rhs_string.length()); | ||||
|         builder.append(lhs_string); | ||||
|         builder.append(rhs_string); | ||||
|         return Value(js_string(vm, builder.to_string())); | ||||
|         auto lhs_string = TRY(lhs_primitive.to_primitive_string(global_object)); | ||||
|         auto rhs_string = TRY(rhs_primitive.to_primitive_string(global_object)); | ||||
|         return concatenate_strings(global_object, *lhs_string, *rhs_string); | ||||
|     } | ||||
| 
 | ||||
|     auto lhs_numeric = TRY(lhs_primitive.to_numeric(global_object)); | ||||
|  |  | |||
							
								
								
									
										30
									
								
								Userland/Libraries/LibJS/Tests/string-concatenation.js
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								Userland/Libraries/LibJS/Tests/string-concatenation.js
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,30 @@ | |||
// String + string concatenation, covering empty operands on either side.
test("adding strings", () => {
    const cases = [
        { lhs: "", rhs: "", expected: "" },
        { lhs: "ab", rhs: "", expected: "ab" },
        { lhs: "", rhs: "cd", expected: "cd" },
        { lhs: "ab", rhs: "cd", expected: "abcd" },
    ];

    cases.forEach(testCase => {
        expect(testCase.lhs + testCase.rhs).toBe(testCase.expected);
    });
});
| 
 | ||||
// String concatenation where one operand is not a string and must be converted first.
test("adding strings with non-strings", () => {
    expect("a" + 1).toBe("a1");
    expect(1 + "a").toBe("1a");
    expect("a" + {}).toBe("a[object Object]");
    // In argument position `{}` is an object literal (not an empty block), so the sum is a
    // string. Assert the exact string rather than merely that it is not a number.
    expect({} + "a").toBe("[object Object]a");
    expect("a" + []).toBe("a");
    expect([] + "a").toBe("a");
    expect("a" + NaN).toBe("aNaN");
    expect(NaN + "a").toBe("NaNa");
    expect(Array(16).join([[][[]] + []][+[]][++[+[]][+[]]] - 1) + " Batman!").toBe(
        "NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN Batman!"
    );
});
| 
 | ||||
// Lone surrogates must survive concatenation unchanged, except when a trailing high surrogate
// on the left meets a leading low surrogate on the right: those combine into one code point.
test("adding strings with dangling surrogates", () => {
    const cases = [
        { lhs: "\ud834", rhs: "", expected: "\ud834" },
        { lhs: "", rhs: "\udf06", expected: "\udf06" },
        { lhs: "\ud834", rhs: "\udf06", expected: "𝌆" },
        { lhs: "\ud834", rhs: "\ud834", expected: "\ud834\ud834" },
        { lhs: "\udf06", rhs: "\udf06", expected: "\udf06\udf06" },
        { lhs: "\ud834a", rhs: "\udf06", expected: "\ud834a\udf06" },
        { lhs: "\ud834", rhs: "a\udf06", expected: "\ud834a\udf06" },
    ];

    cases.forEach(testCase => {
        expect(testCase.lhs + testCase.rhs).toBe(testCase.expected);
    });
});
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Flynn
						Timothy Flynn