I've been using AngelScript for over ten years. Congratulations on the product's 20th anniversary!
I recently found a bug in the tokenizer code when processing a non-terminated heredoc string. The tokenization of a normal string constant only exits with tokenType=ttStringConstant if a closing quotation mark is found. But the heredoc will return with tokenType=ttHeredocStringConstant if it simply exhausts the source, which is very likely (I think) with a non-terminated heredoc.
I propose the following change to fix the bug. Introducing a new tokenType, ttNonTerminatedHeredocStringConstant, isn't strictly necessary; I've included it here only to mirror the normal string processing, and I realize that adding this token type would require changes in other files. Reusing the existing ttNonTerminatedStringConstant for the non-terminated heredoc case should work fine as well.
Original (as_tokenizer.cpp, line 338):
// String constant between double or single quotes
// Classifies the literal that starts at source[0]: sets tokenType and
// tokenLength, and returns true on every path through this branch.
if( source[0] == '"' || source[0] == '\'' )
{
// Is it a normal string constant or a heredoc string constant?
// The heredoc form is only chosen when at least six characters are available,
// i.e. enough room for an opening and a closing """.
if( sourceLength >= 6 && source[0] == '"' && source[1] == '"' && source[2] == '"' )
{
// Heredoc string constant (spans multiple lines, no escape sequences)
// Find the length
size_t n;
// The bound n < sourceLength-2 keeps the source[n+2] read inside the source.
for( n = 3; n < sourceLength-2; n++ )
{
if( source[n] == '"' && source[n+1] == '"' && source[n+2] == '"' )
break;
}
// NOTE(review): this is reached both when the closing """ was found and when
// the loop ran off the end. In the latter case n == sourceLength-2, so the
// token is still reported as ttHeredocStringConstant and tokenLength becomes
// sourceLength+1.
tokenType = ttHeredocStringConstant;
tokenLength = n+3;
}
else
{
// Normal string constant
tokenType = ttStringConstant;
char quote = source[0];
// evenSlashes is true when the run of backslashes immediately before the
// current character has even length, meaning the character is not escaped.
bool evenSlashes = true;
size_t n;
for( n = 1; n < sourceLength; n++ )
{
#ifdef AS_DOUBLEBYTE_CHARSET
// Double-byte characters are only allowed for ASCII
if( (source[n] & 0x80) && engine->ep.scanner == 0 )
{
// This is a leading character in a double byte character,
// include both in the string and continue processing.
n++;
continue;
}
#endif
// A raw newline inside the literal reclassifies it as a multiline string.
if( source[n] == '\n' )
tokenType = ttMultilineStringConstant;
// An unescaped closing quote ends the token; its length includes the quote.
if( source[n] == quote && evenSlashes )
{
tokenLength = n+1;
return true;
}
if( source[n] == '\\' ) evenSlashes = !evenSlashes; else evenSlashes = true;
}
// The source ended before a closing quote was found; the token is everything
// scanned so far (n == sourceLength unless a double-byte skip overshot).
tokenType = ttNonTerminatedStringConstant;
tokenLength = n;
}
return true;
}
Proposal:
// String constant between double or single quotes.
// Classifies the literal that starts at source[0]: sets tokenType and
// tokenLength, and returns true on every path through this branch.
if( source[0] == '"' || source[0] == '\'' )
{
    // Is it a normal string constant or a heredoc string constant?
    // Four characters (the opening """ plus at least one more) are enough to
    // commit to the heredoc form; a bare """ still goes to the normal string
    // scanner below.
    if( sourceLength >= 4 && source[0] == '"' && source[1] == '"' && source[2] == '"' )
    {
        // Heredoc string constant (spans multiple lines, no escape sequences)

        // Look for the closing """. Writing the bound as n + 2 < sourceLength
        // keeps the source[n+2] read inside the source without any unsigned
        // subtraction.
        for( size_t n = 3; n + 2 < sourceLength; n++ )
        {
            if( source[n] == '"' && source[n+1] == '"' && source[n+2] == '"' )
            {
                tokenType   = ttHeredocStringConstant;
                tokenLength = n+3;
                return true;
            }
        }

        // No closing """ before the end of the source: the token is everything
        // that remains. Using sourceLength directly (rather than n+2) matters
        // when sourceLength == 4: the loop never runs, n would still be 3, and
        // n+2 would claim one character more than the source contains.
        // ttNonTerminatedHeredocStringConstant is the new token type proposed
        // here; it has to be declared alongside the other token types.
        tokenType   = ttNonTerminatedHeredocStringConstant;
        tokenLength = sourceLength;
    }
    else
    {
        // Normal string constant
        tokenType = ttStringConstant;
        char quote = source[0];

        // evenSlashes is true when the run of backslashes immediately before
        // the current character has even length, meaning the character is not
        // escaped.
        bool evenSlashes = true;
        size_t n;
        for( n = 1; n < sourceLength; n++ )
        {
#ifdef AS_DOUBLEBYTE_CHARSET
            // Double-byte characters are only allowed for ASCII
            if( (source[n] & 0x80) && engine->ep.scanner == 0 )
            {
                // This is a leading character in a double byte character,
                // include both in the string and continue processing.
                n++;
                continue;
            }
#endif

            // A raw newline inside the literal reclassifies it as multiline.
            if( source[n] == '\n' )
                tokenType = ttMultilineStringConstant;

            // An unescaped closing quote ends the token (quote included).
            if( source[n] == quote && evenSlashes )
            {
                tokenLength = n+1;
                return true;
            }

            if( source[n] == '\\' ) evenSlashes = !evenSlashes; else evenSlashes = true;
        }

        // The source ended before a closing quote was found.
        tokenType   = ttNonTerminatedStringConstant;
        tokenLength = n;
    }

    return true;
}