[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
This revision was automatically updated to reflect the committed changes. Closed by commit rL346969: [AST] Store the string data in StringLiteral in a trailing array of chars (authored by brunoricci, committed by ). Herald added a subscriber: llvm-commits. Changed prior to commit: https://reviews.llvm.org/D54166?vs=174032=174228#toc Repository: rL LLVM https://reviews.llvm.org/D54166 Files: cfe/trunk/include/clang/AST/Expr.h cfe/trunk/include/clang/AST/Stmt.h cfe/trunk/lib/AST/Expr.cpp cfe/trunk/lib/Serialization/ASTReaderStmt.cpp cfe/trunk/lib/Serialization/ASTWriterStmt.cpp Index: cfe/trunk/include/clang/AST/Expr.h === --- cfe/trunk/include/clang/AST/Expr.h +++ cfe/trunk/include/clang/AST/Expr.h @@ -1568,98 +1568,131 @@ /// char X[2] = "foobar"; /// In this case, getByteLength() will return 6, but the string literal will /// have type "char[2]". -class StringLiteral : public Expr { +class StringLiteral final +: public Expr, + private llvm::TrailingObjects { + friend class ASTStmtReader; + friend TrailingObjects; + + /// StringLiteral is followed by several trailing objects. They are in order: + /// + /// * A single unsigned storing the length in characters of this string. The + /// length in bytes is this length times the width of a single character. + /// Always present and stored as a trailing objects because storing it in + /// StringLiteral would increase the size of StringLiteral by sizeof(void *) + /// due to alignment requirements. If you add some data to StringLiteral, + /// consider moving it inside StringLiteral. + /// + /// * An array of getNumConcatenated() SourceLocation, one for each of the + /// token this string is made of. + /// + /// * An array of getByteLength() char used to store the string data. + public: enum StringKind { Ascii, Wide, UTF8, UTF16, UTF32 }; private: - friend class ASTStmtReader; + unsigned numTrailingObjects(OverloadToken) const { return 1; } + unsigned numTrailingObjects(OverloadToken) const { +return getNumConcatenated(); + } - union { -const char *asChar; -const uint16_t *asUInt16; -const uint32_t *asUInt32; - } StrData; - unsigned Length; - unsigned CharByteWidth : 4; - unsigned Kind : 3; - unsigned IsPascal : 1; - unsigned NumConcatenated; - SourceLocation TokLocs[1]; - - StringLiteral(QualType Ty) : -Expr(StringLiteralClass, Ty, VK_LValue, OK_Ordinary, false, false, false, - false) {} + unsigned numTrailingObjects(OverloadToken) const { +return getByteLength(); + } + + char *getStrDataAsChar() { return getTrailingObjects(); } + const char *getStrDataAsChar() const { return getTrailingObjects(); } + + const uint16_t *getStrDataAsUInt16() const { +return reinterpret_cast(getTrailingObjects()); + } + + const uint32_t *getStrDataAsUInt32() const { +return reinterpret_cast(getTrailingObjects()); + } + + /// Build a string literal. + StringLiteral(const ASTContext , StringRef Str, StringKind Kind, +bool Pascal, QualType Ty, const SourceLocation *Loc, +unsigned NumConcatenated); + + /// Build an empty string literal. + StringLiteral(EmptyShell Empty, unsigned NumConcatenated, unsigned Length, +unsigned CharByteWidth); /// Map a target and string kind to the appropriate character width. static unsigned mapCharByteWidth(TargetInfo const , StringKind SK); + /// Set one of the string literal token. + void setStrTokenLoc(unsigned TokNum, SourceLocation L) { +assert(TokNum < getNumConcatenated() && "Invalid tok number"); +getTrailingObjects()[TokNum] = L; + } + public: /// This is the "fully general" constructor that allows representation of /// strings formed from multiple concatenated tokens. - static StringLiteral *Create(const ASTContext , StringRef Str, + static StringLiteral *Create(const ASTContext , StringRef Str, StringKind Kind, bool Pascal, QualType Ty, - const SourceLocation *Loc, unsigned NumStrs); + const SourceLocation *Loc, + unsigned NumConcatenated); /// Simple constructor for string literals made from one token. - static StringLiteral *Create(const ASTContext , StringRef Str, + static StringLiteral *Create(const ASTContext , StringRef Str, StringKind Kind, bool Pascal, QualType Ty, SourceLocation Loc) { -return Create(C, Str, Kind, Pascal, Ty, , 1); +return Create(Ctx, Str, Kind, Pascal, Ty, , 1); } /// Construct an empty string literal. - static StringLiteral *CreateEmpty(const ASTContext , unsigned NumStrs); + static StringLiteral *CreateEmpty(const ASTContext , +unsigned NumConcatenated, unsigned Length, +
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
riccibruno added a comment. In https://reviews.llvm.org/D54166#1299230, @rjmccall wrote: > In https://reviews.llvm.org/D54166#1299149, @riccibruno wrote: > > > In https://reviews.llvm.org/D54166#1298889, @rjmccall wrote: > > > > > IIRC, abbreviations just silently don't take effect if the record doesn't > > > conform; so things will appear to work, but the size on disk will be > > > bigger. > > > > > > I looked at where the abbreviations are defined, and it seems that the only > > abbreviations for > > statements/expressions are for `DeclRefExpr`, `IntegerLiteral`, > > `CharacterLiteral` > > and `ImplicitCastExpr` (grep for `EmitAbbrev` in `Serialization/`, > > for some reasons they are emitted in `WriteDeclAbbrev()`...). > > > > And indeed changing the serialization format of `CharacterLiteral` triggers > > various assertions > > because of the abbreviation. Therefore unless I am missing something no > > other statement/expression > > has currently an abbreviation. I suspect therefore that someone could go > > wild and cut the on-disk > > size of the serialization format significantly here. > > > > I looked at the size of the generated pch for all of Boost, and I am only > > seeing an increase of > > about 8k, which is entirely attributable to the fact that I am adding one > > field to the serialization > > format. I can rework it to remove this additional field if needed. > > > If you're generally interested in improving build times, working on the > serialization format would probably be really valuable. We don't generally > insist on a zero-regressions policy in other commits, though. I was planning to finish packing the other expression classes to minimize the in-memory footprint of the AST first. I will try to take a look at serialization if I have some time but no promise for now. Thanks for the review ! Repository: rC Clang https://reviews.llvm.org/D54166 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
rjmccall accepted this revision. rjmccall added a comment. This revision is now accepted and ready to land. LGTM. Repository: rC Clang https://reviews.llvm.org/D54166 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
rjmccall added a comment. In https://reviews.llvm.org/D54166#1299149, @riccibruno wrote: > In https://reviews.llvm.org/D54166#1298889, @rjmccall wrote: > > > IIRC, abbreviations just silently don't take effect if the record doesn't > > conform; so things will appear to work, but the size on disk will be bigger. > > > I looked at where the abbreviations are defined, and it seems that the only > abbreviations for > statements/expressions are for `DeclRefExpr`, `IntegerLiteral`, > `CharacterLiteral` > and `ImplicitCastExpr` (grep for `EmitAbbrev` in `Serialization/`, > for some reasons they are emitted in `WriteDeclAbbrev()`...). > > And indeed changing the serialization format of `CharacterLiteral` triggers > various assertions > because of the abbreviation. Therefore unless I am missing something no > other statement/expression > has currently an abbreviation. I suspect therefore that someone could go > wild and cut the on-disk > size of the serialization format significantly here. > > I looked at the size of the generated pch for all of Boost, and I am only > seeing an increase of > about 8k, which is entirely attributable to the fact that I am adding one > field to the serialization > format. I can rework it to remove this additional field if needed. If you're generally interested in improving build times, working on the serialization format would probably be really valuable. We don't generally insist on a zero-regressions policy in other commits, though. Repository: rC Clang https://reviews.llvm.org/D54166 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
riccibruno marked 2 inline comments as done. riccibruno added a comment. In https://reviews.llvm.org/D54166#1298889, @rjmccall wrote: > IIRC, abbreviations just silently don't take effect if the record doesn't > conform; so things will appear to work, but the size on disk will be bigger. I looked at where the abbreviations are defined, and it seems that the only abbreviations for statements/expressions are for `DeclRefExpr`, `IntegerLiteral`, `CharacterLiteral` and `ImplicitCastExpr` (grep for `EmitAbbrev` in `Serialization/`, for some reasons they are emitted in `WriteDeclAbbrev()`...). And indeed changing the serialization format of `CharacterLiteral` triggers various assertions because of the abbreviation. Therefore unless I am missing something no other statement/expression has currently an abbreviation. I suspect therefore that someone could go wild and cut the on-disk size of the serialization format significantly here. I looked at the size of the generated pch for all of Boost, and I am only seeing an increase of about 8k, which is entirely attributable to the fact that I am adding one field to the serialization format. I can rework it to remove this additional field if needed. Repository: rC Clang https://reviews.llvm.org/D54166 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
rjmccall added a comment. IIRC, abbreviations just silently don't take effect if the record doesn't conform; so things will appear to work, but the size on disk will be bigger. Comment at: include/clang/AST/Expr.h:1615 + } + + /// Build a string literal. riccibruno wrote: > Note that the trailing array of chars is aligned to 4 bytes > since it is after the array of `SourceLocation`. > Therefore I believe that the `uint16_t *` and `uint32_t *` > point to properly aligned memory. However I can add an > assertion here if needed. I think it's fine. Repository: rC Clang https://reviews.llvm.org/D54166 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
riccibruno marked 2 inline comments as done. riccibruno added inline comments. Comment at: include/clang/AST/Expr.h:1615 + } + + /// Build a string literal. Note that the trailing array of chars is aligned to 4 bytes since it is after the array of `SourceLocation`. Therefore I believe that the `uint16_t *` and `uint32_t *` point to properly aligned memory. However I can add an assertion here if needed. Repository: rC Clang https://reviews.llvm.org/D54166 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
riccibruno updated this revision to Diff 174032. riccibruno added a comment. Factored out some NFSs which I will submit separately. Repository: rC Clang https://reviews.llvm.org/D54166 Files: include/clang/AST/Expr.h include/clang/AST/Stmt.h lib/AST/Expr.cpp lib/Serialization/ASTReaderStmt.cpp lib/Serialization/ASTWriterStmt.cpp Index: lib/Serialization/ASTWriterStmt.cpp === --- lib/Serialization/ASTWriterStmt.cpp +++ lib/Serialization/ASTWriterStmt.cpp @@ -518,17 +518,23 @@ void ASTStmtWriter::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - Record.push_back(E->getByteLength()); + + // Store the various bits of data of StringLiteral. Record.push_back(E->getNumConcatenated()); + Record.push_back(E->getLength()); + Record.push_back(E->getCharByteWidth()); Record.push_back(E->getKind()); Record.push_back(E->isPascal()); - // FIXME: String data should be stored as a blob at the end of the - // StringLiteral. However, we can't do so now because we have no - // provision for coping with abbreviations when we're jumping around - // the AST file during deserialization. - Record.append(E->getBytes().begin(), E->getBytes().end()); + + // Store the trailing array of SourceLocation. for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) Record.AddSourceLocation(E->getStrTokenLoc(I)); + + // Store the trailing array of char holding the string data. + StringRef StrData = E->getBytes(); + for (unsigned I = 0, N = E->getByteLength(); I != N; ++I) +Record.push_back(StrData[I]); + Code = serialization::EXPR_STRING_LITERAL; } Index: lib/Serialization/ASTReaderStmt.cpp === --- lib/Serialization/ASTReaderStmt.cpp +++ lib/Serialization/ASTReaderStmt.cpp @@ -595,22 +595,35 @@ void ASTStmtReader::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - unsigned Len = Record.readInt(); - assert(Record.peekInt() == E->getNumConcatenated() && - "Wrong number of concatenated tokens!"); - Record.skipInts(1); - auto kind = static_cast(Record.readInt()); - bool isPascal = Record.readInt(); - // Read string data - auto B = (); - SmallString<16> Str(B, B + Len); - E->setString(Record.getContext(), Str, kind, isPascal); - Record.skipInts(Len); - - // Read source locations - for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) + // NumConcatenated, Length and CharByteWidth are set by the empty + // ctor since they are needed to allocate storage for the trailing objects. + unsigned NumConcatenated = Record.readInt(); + unsigned Length = Record.readInt(); + unsigned CharByteWidth = Record.readInt(); + assert((NumConcatenated == E->getNumConcatenated()) && + "Wrong number of concatenated tokens!"); + assert((Length == E->getLength()) && "Wrong Length!"); + assert((CharByteWidth == E->getCharByteWidth()) && "Wrong character width!"); + E->StringLiteralBits.Kind = Record.readInt(); + E->StringLiteralBits.IsPascal = Record.readInt(); + + // The character width is originally computed via mapCharByteWidth. + // Check that the deserialized character width is consistant with the result + // of calling mapCharByteWidth. + assert((CharByteWidth == + StringLiteral::mapCharByteWidth(Record.getContext().getTargetInfo(), + E->getKind())) && + "Wrong character width!"); + + // Deserialize the trailing array of SourceLocation. + for (unsigned I = 0; I < NumConcatenated; ++I) E->setStrTokenLoc(I, ReadSourceLocation()); + + // Deserialize the trailing array of char holding the string data. + char *StrData = E->getStrDataAsChar(); + for (unsigned I = 0; I < Length * CharByteWidth; ++I) +StrData[I] = Record.readInt(); } void ASTStmtReader::VisitCharacterLiteral(CharacterLiteral *E) { @@ -2423,8 +2436,11 @@ break; case EXPR_STRING_LITERAL: - S = StringLiteral::CreateEmpty(Context, - Record[ASTStmtReader::NumExprFields + 1]); + S = StringLiteral::CreateEmpty( + Context, + /* NumConcatenated=*/Record[ASTStmtReader::NumExprFields + 0], + /* Length=*/Record[ASTStmtReader::NumExprFields + 1], + /* CharByteWidth=*/Record[ASTStmtReader::NumExprFields + 2]); break; case EXPR_CHARACTER_LITERAL: Index: lib/AST/Expr.cpp === --- lib/AST/Expr.cpp +++ lib/AST/Expr.cpp @@ -912,42 +912,80 @@ return CharByteWidth; } -StringLiteral *StringLiteral::Create(const ASTContext , StringRef Str, - StringKind Kind, bool Pascal, QualType Ty, - const SourceLocation *Loc, - unsigned NumStrs) { - assert(C.getAsConstantArrayType(Ty) && +StringLiteral::StringLiteral(const
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
dblaikie added a comment. In https://reviews.llvm.org/D54166#1295730, @riccibruno wrote: > @dblaikie Thanks for looking at this patch ! > > I have a set of patches shrinking the other statements/expressions. > Can I add you to review some of these too ? Most of them consists of just > moving > some data. I added rsmith but I think that he is busy with the standard. Sure thing! do pull out as many bits of cleanup, renaming, refactoring, etc, into separate patches (no worries if you have to also send those for review (not sure if you have commit access/what level of stuff you want to review-after-commit, etc) - still much easier to review in little pieces than all together) Repository: rC Clang https://reviews.llvm.org/D54166 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
riccibruno added a comment. @dblaikie Thanks for looking at this patch ! I have a set of patches shrinking the other statements/expressions. Can I add you to review some of these too ? Most of them consists of just moving some data. I added rsmith but I think that he is busy with the standard. Comment at: include/clang/AST/Expr.h:1701 +for (auto c : getString()) + if (!isASCII(c)) return true; They are not needed, I just could not resist. I can factor these changes out of this patch and submit it as an NFC. Repository: rC Clang https://reviews.llvm.org/D54166 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
dblaikie added inline comments. Comment at: include/clang/AST/Expr.h:1700-1701 bool containsNonAscii() const { -StringRef Str = getString(); -for (unsigned i = 0, e = Str.size(); i != e; ++i) - if (!isASCII(Str[i])) +for (auto c : getString()) + if (!isASCII(c)) return true; Seems like changes like this are unrelated refactoring? Or are they in some way needed for this change? (also the isXXX functions above - might be neat/tidy to pull those out separately (as an NFC change) so that then this change (that modifies the implementation of getKind) is just that, without having to refactor all the other bits too? But not a big deal either way there, really) Repository: rC Clang https://reviews.llvm.org/D54166 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D54166: [AST] Store the string data in StringLiteral in a trailing array of chars
riccibruno created this revision. riccibruno added a reviewer: rsmith. riccibruno added a project: clang. Herald added a subscriber: cfe-commits. Use the newly available space in the bit-fields of `Stmt` and store the string data in a trailing array of `char`s after the trailing array of `SourceLocation`. This cuts the size of `StringLiteral` by 2 pointers. Also refactor slightly `StringLiteral::Create` and `StringLiteral::CreateEmpty` so that `StringLiteral::Create` is just responsible for the allocation, and the constructor is responsible for doing all the initialization. This match what is done for the other classes in general. Two points I am wondering about: The original version used type punning through an union. Here I am just `reinterpret_cast`ing back and forth between `char *` and `uint16_t *`/`uint32_t *`. There is a FIXME in `ASTWriterStmt.cpp` (see the end of the diff) which says that storing the string past the `StringLiteral` would cause problems with abbreviations. The note dates from 2009 and everything *seems* to be working just fine. However I am not familiar with this and it is possible I am missing something here. Repository: rC Clang https://reviews.llvm.org/D54166 Files: include/clang/AST/Expr.h include/clang/AST/Stmt.h lib/AST/Expr.cpp lib/Serialization/ASTReaderStmt.cpp lib/Serialization/ASTWriterStmt.cpp Index: lib/Serialization/ASTWriterStmt.cpp === --- lib/Serialization/ASTWriterStmt.cpp +++ lib/Serialization/ASTWriterStmt.cpp @@ -527,17 +527,23 @@ void ASTStmtWriter::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - Record.push_back(E->getByteLength()); + + // Store the various bits of data of StringLiteral. Record.push_back(E->getNumConcatenated()); + Record.push_back(E->getLength()); + Record.push_back(E->getCharByteWidth()); Record.push_back(E->getKind()); Record.push_back(E->isPascal()); - // FIXME: String data should be stored as a blob at the end of the - // StringLiteral. However, we can't do so now because we have no - // provision for coping with abbreviations when we're jumping around - // the AST file during deserialization. - Record.append(E->getBytes().begin(), E->getBytes().end()); + + // Store the trailing array of SourceLocation. for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) Record.AddSourceLocation(E->getStrTokenLoc(I)); + + // Store the trailing array of char holding the string data. + StringRef StrData = E->getBytes(); + for (unsigned I = 0, N = E->getByteLength(); I != N; ++I) +Record.push_back(StrData[I]); + Code = serialization::EXPR_STRING_LITERAL; } Index: lib/Serialization/ASTReaderStmt.cpp === --- lib/Serialization/ASTReaderStmt.cpp +++ lib/Serialization/ASTReaderStmt.cpp @@ -602,22 +602,35 @@ void ASTStmtReader::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - unsigned Len = Record.readInt(); - assert(Record.peekInt() == E->getNumConcatenated() && - "Wrong number of concatenated tokens!"); - Record.skipInts(1); - auto kind = static_cast(Record.readInt()); - bool isPascal = Record.readInt(); - // Read string data - auto B = (); - SmallString<16> Str(B, B + Len); - E->setString(Record.getContext(), Str, kind, isPascal); - Record.skipInts(Len); - - // Read source locations - for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) + // NumConcatenated, Length and CharByteWidth are set by the empty + // ctor since they are needed to allocate storage for the trailing objects. + unsigned NumConcatenated = Record.readInt(); + unsigned Length = Record.readInt(); + unsigned CharByteWidth = Record.readInt(); + assert((NumConcatenated == E->getNumConcatenated()) && + "Wrong number of concatenated tokens!"); + assert((Length == E->getLength()) && "Wrong Length!"); + assert((CharByteWidth == E->getCharByteWidth()) && "Wrong character width!"); + E->StringLiteralBits.Kind = Record.readInt(); + E->StringLiteralBits.IsPascal = Record.readInt(); + + // The character width is originally computed via mapCharByteWidth. + // Check that the deserialized character width is consistant with the result + // of calling mapCharByteWidth. + assert((CharByteWidth == + StringLiteral::mapCharByteWidth(Record.getContext().getTargetInfo(), + E->getKind())) && + "Wrong character width!"); + + // Deserialize the trailing array of SourceLocation. + for (unsigned I = 0; I < NumConcatenated; ++I) E->setStrTokenLoc(I, ReadSourceLocation()); + + // Deserialize the trailing array of char holding the string data. + char *StrData = E->getStrDataAsChar(); + for (unsigned I = 0; I < Length * CharByteWidth; ++I) +StrData[I] = Record.readInt(); } void ASTStmtReader::VisitCharacterLiteral(CharacterLiteral *E) { @@ -2433,8 +2446,11 @@