[PATCH] D53363: [clangd] Encode Line/Column as a 32-bits integer.

2018-10-18 Thread Haojian Wu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rCTE344735: [clangd] Encode Line/Column as a 32-bits integer. 
(authored by hokein, committed by ).

Changed prior to commit:
  https://reviews.llvm.org/D53363?vs=170053=170064#toc

Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D53363

Files:
  clangd/FindSymbols.cpp
  clangd/XRefs.cpp
  clangd/index/Index.cpp
  clangd/index/Index.h
  clangd/index/Serialization.cpp
  clangd/index/SymbolCollector.cpp
  clangd/index/YAMLSerialization.cpp
  unittests/clangd/FileIndexTests.cpp
  unittests/clangd/IndexTests.cpp
  unittests/clangd/SymbolCollectorTests.cpp

Index: unittests/clangd/FileIndexTests.cpp
===
--- unittests/clangd/FileIndexTests.cpp
+++ unittests/clangd/FileIndexTests.cpp
@@ -32,10 +32,10 @@
 using testing::UnorderedElementsAre;
 
 MATCHER_P(RefRange, Range, "") {
-  return std::tie(arg.Location.Start.Line, arg.Location.Start.Column,
-  arg.Location.End.Line, arg.Location.End.Column) ==
- std::tie(Range.start.line, Range.start.character, Range.end.line,
-  Range.end.character);
+  return std::make_tuple(arg.Location.Start.line(), arg.Location.Start.column(),
+ arg.Location.End.line(), arg.Location.End.column()) ==
+ std::make_tuple(Range.start.line, Range.start.character,
+ Range.end.line, Range.end.character);
 }
 MATCHER_P(FileURI, F, "") { return arg.Location.FileURI == F; }
 MATCHER_P(DeclURI, U, "") { return arg.CanonicalDeclaration.FileURI == U; }
Index: unittests/clangd/IndexTests.cpp
===
--- unittests/clangd/IndexTests.cpp
+++ unittests/clangd/IndexTests.cpp
@@ -31,13 +31,28 @@
 
 MATCHER_P(Named, N, "") { return arg.Name == N; }
 MATCHER_P(RefRange, Range, "") {
-  return std::tie(arg.Location.Start.Line, arg.Location.Start.Column,
-  arg.Location.End.Line, arg.Location.End.Column) ==
- std::tie(Range.start.line, Range.start.character, Range.end.line,
-  Range.end.character);
+  return std::make_tuple(arg.Location.Start.line(), arg.Location.Start.column(),
+ arg.Location.End.line(), arg.Location.End.column()) ==
+ std::make_tuple(Range.start.line, Range.start.character,
+ Range.end.line, Range.end.character);
 }
 MATCHER_P(FileURI, F, "") { return arg.Location.FileURI == F; }
 
+TEST(SymbolLocation, Position) {
+  using Position = SymbolLocation::Position;
+  Position Pos;
+
+  Pos.setLine(1);
+  EXPECT_EQ(1u, Pos.line());
+  Pos.setColumn(2);
+  EXPECT_EQ(2u, Pos.column());
+
+  Pos.setLine(Position::MaxLine + 1); // overflow
+  EXPECT_EQ(Pos.line(), Position::MaxLine);
+  Pos.setColumn(Position::MaxColumn + 1); // overflow
+  EXPECT_EQ(Pos.column(), Position::MaxColumn);
+}
+
 TEST(SymbolSlab, FindAndIterate) {
   SymbolSlab::Builder B;
   B.insert(symbol("Z"));
Index: unittests/clangd/SymbolCollectorTests.cpp
===
--- unittests/clangd/SymbolCollectorTests.cpp
+++ unittests/clangd/SymbolCollectorTests.cpp
@@ -63,19 +63,19 @@
   return (arg.IncludeHeader == IncludeHeader) && (arg.References == References);
 }
 MATCHER_P(DeclRange, Pos, "") {
-  return std::tie(arg.CanonicalDeclaration.Start.Line,
-  arg.CanonicalDeclaration.Start.Column,
-  arg.CanonicalDeclaration.End.Line,
-  arg.CanonicalDeclaration.End.Column) ==
- std::tie(Pos.start.line, Pos.start.character, Pos.end.line,
-  Pos.end.character);
+  return std::make_tuple(arg.CanonicalDeclaration.Start.line(),
+ arg.CanonicalDeclaration.Start.column(),
+ arg.CanonicalDeclaration.End.line(),
+ arg.CanonicalDeclaration.End.column()) ==
+ std::make_tuple(Pos.start.line, Pos.start.character, Pos.end.line,
+ Pos.end.character);
 }
 MATCHER_P(DefRange, Pos, "") {
-  return std::tie(arg.Definition.Start.Line,
-  arg.Definition.Start.Column, arg.Definition.End.Line,
-  arg.Definition.End.Column) ==
- std::tie(Pos.start.line, Pos.start.character, Pos.end.line,
-  Pos.end.character);
+  return std::make_tuple(
+ arg.Definition.Start.line(), arg.Definition.Start.column(),
+ arg.Definition.End.line(), arg.Definition.End.column()) ==
+ std::make_tuple(Pos.start.line, Pos.start.character, Pos.end.line,
+ Pos.end.character);
 }
 MATCHER_P(RefCount, R, "") { return int(arg.References) == R; }
 MATCHER_P(ForCodeCompletion, IsIndexedForCodeCompletion, "") {
@@ -86,10 +86,10 @@
 MATCHER(RefRange, "") {
   const Ref  = testing::get<0>(arg);
   const Range  = testing::get<1>(arg);
-  return 

[PATCH] D53363: [clangd] Encode Line/Column as a 32-bits integer.

2018-10-18 Thread Haojian Wu via Phabricator via cfe-commits
hokein added a comment.

> In https://reviews.llvm.org/D53363#1267721, @hokein wrote:
> 
>> Yeah, I have a rough patch for it, using char* will save us ~50MB memory, 
>> which will lead to ~300 MB memory usage in total.
> 
> 
> For just the StringRef in SymbolLocation::Position, or for all our strings?
>  If the latter, that seems too low, as we should save strictly more than this 
> patch (unless I'm missing something about alignment/padding)

This is just for the StringRef in `Position`, I think it matches our estimation.

More details coming:

- 6.4 million refs,  6.4m * 40 bytes = ~250MB
- 0.3 million symbols, 0.3m * 248 bytes = ~70MB

The original size of `Ref` (before any space optimization) is 40 bytes, we save 
8 bytes per refs, 8*6.5 = ~50 MB; If we change all StringRef in `Symbol` to 
`char*`, which will save 64 bytes (8*8) per sym, 64*0.3v = ~19 MB, which 
doesn't gain as much as refs, as refs contributes more than 70% memory.




Comment at: clangd/index/Index.h:66
const SymbolLocation::Position ) {
-  return std::tie(L.Line, L.Column) == std::tie(R.Line, R.Column);
+  return std::make_tuple(L.line(), L.column()) ==
+ std::make_tuple(R.Line, R.Column);

sammccall wrote:
> Why only using the getters on one side?
Oops, I mis-thought the type of the right side LSP position, fixed.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D53363



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D53363: [clangd] Encode Line/Column as a 32-bits integer.

2018-10-18 Thread Haojian Wu via Phabricator via cfe-commits
hokein updated this revision to Diff 170053.
hokein marked 6 inline comments as done.
hokein added a comment.

Address review comments.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D53363

Files:
  clangd/FindSymbols.cpp
  clangd/XRefs.cpp
  clangd/index/Index.cpp
  clangd/index/Index.h
  clangd/index/Serialization.cpp
  clangd/index/SymbolCollector.cpp
  clangd/index/YAMLSerialization.cpp
  unittests/clangd/FileIndexTests.cpp
  unittests/clangd/IndexTests.cpp
  unittests/clangd/SymbolCollectorTests.cpp

Index: unittests/clangd/SymbolCollectorTests.cpp
===
--- unittests/clangd/SymbolCollectorTests.cpp
+++ unittests/clangd/SymbolCollectorTests.cpp
@@ -63,19 +63,19 @@
   return (arg.IncludeHeader == IncludeHeader) && (arg.References == References);
 }
 MATCHER_P(DeclRange, Pos, "") {
-  return std::tie(arg.CanonicalDeclaration.Start.Line,
-  arg.CanonicalDeclaration.Start.Column,
-  arg.CanonicalDeclaration.End.Line,
-  arg.CanonicalDeclaration.End.Column) ==
- std::tie(Pos.start.line, Pos.start.character, Pos.end.line,
-  Pos.end.character);
+  return std::make_tuple(arg.CanonicalDeclaration.Start.line(),
+ arg.CanonicalDeclaration.Start.column(),
+ arg.CanonicalDeclaration.End.line(),
+ arg.CanonicalDeclaration.End.column()) ==
+ std::make_tuple(Pos.start.line, Pos.start.character, Pos.end.line,
+ Pos.end.character);
 }
 MATCHER_P(DefRange, Pos, "") {
-  return std::tie(arg.Definition.Start.Line,
-  arg.Definition.Start.Column, arg.Definition.End.Line,
-  arg.Definition.End.Column) ==
- std::tie(Pos.start.line, Pos.start.character, Pos.end.line,
-  Pos.end.character);
+  return std::make_tuple(
+ arg.Definition.Start.line(), arg.Definition.Start.column(),
+ arg.Definition.End.line(), arg.Definition.End.column()) ==
+ std::make_tuple(Pos.start.line, Pos.start.character, Pos.end.line,
+ Pos.end.character);
 }
 MATCHER_P(RefCount, R, "") { return int(arg.References) == R; }
 MATCHER_P(ForCodeCompletion, IsIndexedForCodeCompletion, "") {
@@ -86,10 +86,10 @@
 MATCHER(RefRange, "") {
   const Ref  = testing::get<0>(arg);
   const Range  = testing::get<1>(arg);
-  return std::tie(Pos.Location.Start.Line, Pos.Location.Start.Column,
-  Pos.Location.End.Line, Pos.Location.End.Column) ==
- std::tie(Range.start.line, Range.start.character, Range.end.line,
-  Range.end.character);
+  return std::make_tuple(Pos.Location.Start.line(), Pos.Location.Start.column(),
+ Pos.Location.End.line(), Pos.Location.End.column()) ==
+ std::make_tuple(Range.start.line, Range.start.character,
+ Range.end.line, Range.end.character);
 }
 testing::Matcher &>
 HaveRanges(const std::vector Ranges) {
Index: unittests/clangd/IndexTests.cpp
===
--- unittests/clangd/IndexTests.cpp
+++ unittests/clangd/IndexTests.cpp
@@ -31,13 +31,28 @@
 
 MATCHER_P(Named, N, "") { return arg.Name == N; }
 MATCHER_P(RefRange, Range, "") {
-  return std::tie(arg.Location.Start.Line, arg.Location.Start.Column,
-  arg.Location.End.Line, arg.Location.End.Column) ==
- std::tie(Range.start.line, Range.start.character, Range.end.line,
-  Range.end.character);
+  return std::make_tuple(arg.Location.Start.line(), arg.Location.Start.column(),
+ arg.Location.End.line(), arg.Location.End.column()) ==
+ std::make_tuple(Range.start.line, Range.start.character,
+ Range.end.line, Range.end.character);
 }
 MATCHER_P(FileURI, F, "") { return arg.Location.FileURI == F; }
 
+TEST(SymbolLocation, Position) {
+  using Position = SymbolLocation::Position;
+  Position Pos;
+
+  Pos.setLine(1);
+  EXPECT_EQ(1u, Pos.line());
+  Pos.setColumn(2);
+  EXPECT_EQ(2u, Pos.column());
+
+  Pos.setLine(Position::MaxLine + 1); // overflow
+  EXPECT_EQ(Pos.line(), Position::MaxLine);
+  Pos.setColumn(Position::MaxColumn + 1); // overflow
+  EXPECT_EQ(Pos.column(), Position::MaxColumn);
+}
+
 TEST(SymbolSlab, FindAndIterate) {
   SymbolSlab::Builder B;
   B.insert(symbol("Z"));
Index: unittests/clangd/FileIndexTests.cpp
===
--- unittests/clangd/FileIndexTests.cpp
+++ unittests/clangd/FileIndexTests.cpp
@@ -32,10 +32,10 @@
 using testing::UnorderedElementsAre;
 
 MATCHER_P(RefRange, Range, "") {
-  return std::tie(arg.Location.Start.Line, arg.Location.Start.Column,
-  arg.Location.End.Line, arg.Location.End.Column) ==
- std::tie(Range.start.line, Range.start.character, Range.end.line,
- 

[PATCH] D53363: [clangd] Encode Line/Column as a 32-bits integer.

2018-10-18 Thread Sam McCall via Phabricator via cfe-commits
sammccall accepted this revision.
sammccall added a comment.
This revision is now accepted and ready to land.

Nice, just nits!

In https://reviews.llvm.org/D53363#1267721, @hokein wrote:

> Yeah, I have a rough patch for it, using char* will save us ~50MB memory, 
> which will lead to ~300 MB memory usage in total.


For just the StringRef in SymbolLocation::Position, or for all our strings?
If the latter, that seems too low, as we should save strictly more than this 
patch (unless I'm missing something about alignment/padding)




Comment at: clangd/index/Index.h:41
   struct Position {
-uint32_t Line = 0; // 0-based
+static constexpr int LineBits = 20;
+static constexpr int ColumnBits = 12;

hokein wrote:
> sammccall wrote:
> > (not sure these constants are needed as it stands - YAML shouldn't use 
> > them, see below)
> I think we need, for testing, for setters, rather than using magic number.
In that case I'd suggest static constexpr `MaxLine = 1 << 20 - 1`, `MaxColumn = 
1 << 12 - 1`, as that's what the tests (and potentially some warning checks in 
the code later?) need, so we encapsulate a little more detail. But up to you.

(Writing 20 twice a couple of lines apart seems fine to me)



Comment at: clangd/index/Index.h:46
 // Using UTF-16 code units.
-uint32_t Column = 0; // 0-based
+uint32_t Column : ColumnBits; // 0-based
   };

hokein wrote:
> sammccall wrote:
> > If we overflow the columns, it would be much easier to recognize if the 
> > result is always e.g. 255, rather than an essentially random number from 
> > 0-255 (from modulo 256 arithmetic).
> > 
> > This would mean replacing the raw fields with setters/getters, which is 
> > obviously a more invasive change. What about a compromise: add the setters, 
> > and call them from the most important codepaths: `SymbolCollector` and 
> > `Serialization`.
> It seems reasonable to use maximum value if we encounter overflows.
> 
> > This would mean replacing the raw fields with setters/getters, which is 
> > obviously a more invasive change. What about a compromise: add the setters, 
> > and call them from the most important codepaths: SymbolCollector and 
> > Serialization.
> 
> Actually, there is not too much usage of the raw fields in the source code, 
> I'd prefer to use all setters/getters in all places (and it would make the 
> code consistent and more safe). How about this plan?
> 
> 1. I update the patch to change all raw fields usage to getters/setters, and 
> keep these raw fields public
> 2. do a cleanup internally
> 3. make these raw fields private
SGTM!



Comment at: clangd/index/Index.h:38
+  // Position is encoded into 32 bits to save space.
+  // If Line/Column is overflow, the value will be their maximum value.
   struct Position {

nit: no "is", overflow is a verb



Comment at: clangd/index/Index.h:41
+void setLine(uint32_t Line);
+uint32_t line() const;
+void setColumn(uint32_t Column);

nit: define the trivial getters here so the compiler can always inline them.
(Setters seem fine out-of-line)



Comment at: clangd/index/Index.h:66
const SymbolLocation::Position ) {
-  return std::tie(L.Line, L.Column) == std::tie(R.Line, R.Column);
+  return std::make_tuple(L.line(), L.column()) ==
+ std::make_tuple(R.Line, R.Column);

Why only using the getters on one side?



Comment at: clangd/index/YAMLSerialization.cpp:98
 
-template <> struct MappingTraits {
-  static void mapping(IO , SymbolLocation::Position ) {
-IO.mapRequired("Line", Value.Line);
-IO.mapRequired("Column", Value.Column);
+template <> struct MappingTraits> {
+  static void mapping(IO , std::pair ) {

I don't think specializing the traits for `pair` is OK, 
specializing for common types we don't own is asking for ODR violations.

prefer to define an custom struct for this (in fact you do, so just map that 
struct instead of struct->P?)



Comment at: clangd/index/YAMLSerialization.cpp:104-105
 };
 
+struct NormalizedPosition {
+  using Position = clang::clangd::SymbolLocation::Position;

please document why this normalization (YamlIO can't directly map bitfields)



Comment at: clangd/index/YAMLSerialization.cpp:109
+  NormalizedPosition(IO &, const Position ) {
+static_assert(
+sizeof(Position) == sizeof(uint32_t),

no need for this assert


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D53363



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D53363: [clangd] Encode Line/Column as a 32-bits integer.

2018-10-17 Thread Haojian Wu via Phabricator via cfe-commits
hokein added a comment.

In https://reviews.llvm.org/D53363#1267628, @sammccall wrote:

> (I think your math is off in the description: 20 bits should be 1M lines, not 
> 4M)


Oops...Update the desccription.

> I think this is a win, as I think truncation will be rare and not terrible. 
> We should document the intentions around truncation though.
> 
> Incidentally, this means replacing just the StringRef in 
> SymbolLocation::Position with a char* would save another 13%, because that's 
> also 8 bytes.

Yeah, I have a rough patch for it, using char* will save us ~50MB memory, which 
will lead to ~300 MB memory usage in total.

> (Obviously we'd probably replace the other StringRefs too, but it gives a 
> lower bound).






Comment at: clangd/index/Index.h:41
   struct Position {
-uint32_t Line = 0; // 0-based
+static constexpr int LineBits = 20;
+static constexpr int ColumnBits = 12;

sammccall wrote:
> (not sure these constants are needed as it stands - YAML shouldn't use them, 
> see below)
I think we need, for testing, for setters, rather than using magic number.



Comment at: clangd/index/Index.h:46
 // Using UTF-16 code units.
-uint32_t Column = 0; // 0-based
+uint32_t Column : ColumnBits; // 0-based
   };

sammccall wrote:
> If we overflow the columns, it would be much easier to recognize if the 
> result is always e.g. 255, rather than an essentially random number from 
> 0-255 (from modulo 256 arithmetic).
> 
> This would mean replacing the raw fields with setters/getters, which is 
> obviously a more invasive change. What about a compromise: add the setters, 
> and call them from the most important codepaths: `SymbolCollector` and 
> `Serialization`.
It seems reasonable to use maximum value if we encounter overflows.

> This would mean replacing the raw fields with setters/getters, which is 
> obviously a more invasive change. What about a compromise: add the setters, 
> and call them from the most important codepaths: SymbolCollector and 
> Serialization.

Actually, there is not too much usage of the raw fields in the source code, I'd 
prefer to use all setters/getters in all places (and it would make the code 
consistent and more safe). How about this plan?

1. I update the patch to change all raw fields usage to getters/setters, and 
keep these raw fields public
2. do a cleanup internally
3. make these raw fields private



Comment at: clangd/index/YAMLSerialization.cpp:97
 
-template <> struct MappingTraits {
-  static void mapping(IO , SymbolLocation::Position ) {
-IO.mapRequired("Line", Value.Line);
-IO.mapRequired("Column", Value.Column);
+struct NormalizedPosition {
+  using Position = clang::clangd::SymbolLocation::Position;

sammccall wrote:
> I don't think there's any reason to pack the YAML encoding.

The YAML here is a bit tricky, the previous code could not compile because we 
can not bind bit-field to Non-const references, I added another traits to keep 
the original YAML encoding (which is more readable).


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D53363



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D53363: [clangd] Encode Line/Column as a 32-bits integer.

2018-10-17 Thread Haojian Wu via Phabricator via cfe-commits
hokein updated this revision to Diff 170001.
hokein marked 3 inline comments as done.
hokein added a comment.

Address review comments:

- handle overflowed cases, and added tests
- add getter/setters for line/column and clear all call sides


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D53363

Files:
  clangd/FindSymbols.cpp
  clangd/XRefs.cpp
  clangd/index/Index.cpp
  clangd/index/Index.h
  clangd/index/Serialization.cpp
  clangd/index/SymbolCollector.cpp
  clangd/index/YAMLSerialization.cpp
  unittests/clangd/FileIndexTests.cpp
  unittests/clangd/IndexTests.cpp
  unittests/clangd/SymbolCollectorTests.cpp

Index: unittests/clangd/SymbolCollectorTests.cpp
===
--- unittests/clangd/SymbolCollectorTests.cpp
+++ unittests/clangd/SymbolCollectorTests.cpp
@@ -63,19 +63,19 @@
   return (arg.IncludeHeader == IncludeHeader) && (arg.References == References);
 }
 MATCHER_P(DeclRange, Pos, "") {
-  return std::tie(arg.CanonicalDeclaration.Start.Line,
-  arg.CanonicalDeclaration.Start.Column,
-  arg.CanonicalDeclaration.End.Line,
-  arg.CanonicalDeclaration.End.Column) ==
- std::tie(Pos.start.line, Pos.start.character, Pos.end.line,
-  Pos.end.character);
+  return std::make_tuple(arg.CanonicalDeclaration.Start.line(),
+ arg.CanonicalDeclaration.Start.column(),
+ arg.CanonicalDeclaration.End.line(),
+ arg.CanonicalDeclaration.End.column()) ==
+ std::make_tuple(Pos.start.line, Pos.start.character, Pos.end.line,
+ Pos.end.character);
 }
 MATCHER_P(DefRange, Pos, "") {
-  return std::tie(arg.Definition.Start.Line,
-  arg.Definition.Start.Column, arg.Definition.End.Line,
-  arg.Definition.End.Column) ==
- std::tie(Pos.start.line, Pos.start.character, Pos.end.line,
-  Pos.end.character);
+  return std::make_tuple(
+ arg.Definition.Start.line(), arg.Definition.Start.column(),
+ arg.Definition.End.line(), arg.Definition.End.column()) ==
+ std::make_tuple(Pos.start.line, Pos.start.character, Pos.end.line,
+ Pos.end.character);
 }
 MATCHER_P(RefCount, R, "") { return int(arg.References) == R; }
 MATCHER_P(ForCodeCompletion, IsIndexedForCodeCompletion, "") {
@@ -86,10 +86,10 @@
 MATCHER(RefRange, "") {
   const Ref  = testing::get<0>(arg);
   const Range  = testing::get<1>(arg);
-  return std::tie(Pos.Location.Start.Line, Pos.Location.Start.Column,
-  Pos.Location.End.Line, Pos.Location.End.Column) ==
- std::tie(Range.start.line, Range.start.character, Range.end.line,
-  Range.end.character);
+  return std::make_tuple(Pos.Location.Start.line(), Pos.Location.Start.column(),
+ Pos.Location.End.line(), Pos.Location.End.column()) ==
+ std::make_tuple(Range.start.line, Range.start.character,
+ Range.end.line, Range.end.character);
 }
 testing::Matcher &>
 HaveRanges(const std::vector Ranges) {
Index: unittests/clangd/IndexTests.cpp
===
--- unittests/clangd/IndexTests.cpp
+++ unittests/clangd/IndexTests.cpp
@@ -16,6 +16,7 @@
 #include "index/Merge.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include 
 
 using testing::_;
 using testing::AllOf;
@@ -31,13 +32,28 @@
 
 MATCHER_P(Named, N, "") { return arg.Name == N; }
 MATCHER_P(RefRange, Range, "") {
-  return std::tie(arg.Location.Start.Line, arg.Location.Start.Column,
-  arg.Location.End.Line, arg.Location.End.Column) ==
- std::tie(Range.start.line, Range.start.character, Range.end.line,
-  Range.end.character);
+  return std::make_tuple(arg.Location.Start.line(), arg.Location.Start.column(),
+ arg.Location.End.line(), arg.Location.End.column()) ==
+ std::make_tuple(Range.start.line, Range.start.character,
+ Range.end.line, Range.end.character);
 }
 MATCHER_P(FileURI, F, "") { return arg.Location.FileURI == F; }
 
+TEST(SymbolLocation, Position) {
+  using Position = SymbolLocation::Position;
+  Position Pos;
+
+  Pos.setLine(1);
+  EXPECT_EQ(1u, Pos.line());
+  Pos.setColumn(2);
+  EXPECT_EQ(2u, Pos.column());
+
+  Pos.setLine((1 << Position::LineBits) + 1); // overflow
+  EXPECT_EQ(Pos.line(), (1u << Position::LineBits) - 1);
+  Pos.setColumn((1 << Position::ColumnBits) + 1); // overflow
+  EXPECT_EQ(Pos.column(), (1u << Position::ColumnBits) - 1);
+}
+
 TEST(SymbolSlab, FindAndIterate) {
   SymbolSlab::Builder B;
   B.insert(symbol("Z"));
Index: unittests/clangd/FileIndexTests.cpp
===
--- unittests/clangd/FileIndexTests.cpp
+++ unittests/clangd/FileIndexTests.cpp
@@ -32,10 +32,10 @@
 

[PATCH] D53363: [clangd] Encode Line/Column as a 32-bits integer.

2018-10-17 Thread Sam McCall via Phabricator via cfe-commits
sammccall added a comment.

(I think your math is off in the description: 20 bits should be 1M lines, not 
4M)
I think this is a win, as I think truncation will be rare and not terrible. We 
should document the intentions around truncation though.

Incidentally, this means replacing just the StringRef in 
SymbolLocation::Position with a char* would save another 13%, because that's 
also 8 bytes.
(Obviously we'd probably replace the other StringRefs too, but it gives a lower 
bound).




Comment at: clangd/index/Index.h:37
+  //
+  // Position is encoded as a 32-bits integer like below:
+  // |<--- 20 bits --->|<-12 bits->|

I think it's enough to say "position is packed into 32 bits to save space".

It'd be useful to spell out the consequences in a second line.



Comment at: clangd/index/Index.h:41
   struct Position {
-uint32_t Line = 0; // 0-based
+static constexpr int LineBits = 20;
+static constexpr int ColumnBits = 12;

(not sure these constants are needed as it stands - YAML shouldn't use them, 
see below)



Comment at: clangd/index/Index.h:46
 // Using UTF-16 code units.
-uint32_t Column = 0; // 0-based
+uint32_t Column : ColumnBits; // 0-based
   };

If we overflow the columns, it would be much easier to recognize if the result 
is always e.g. 255, rather than an essentially random number from 0-255 (from 
modulo 256 arithmetic).

This would mean replacing the raw fields with setters/getters, which is 
obviously a more invasive change. What about a compromise: add the setters, and 
call them from the most important codepaths: `SymbolCollector` and 
`Serialization`.



Comment at: clangd/index/YAMLSerialization.cpp:97
 
-template <> struct MappingTraits {
-  static void mapping(IO , SymbolLocation::Position ) {
-IO.mapRequired("Line", Value.Line);
-IO.mapRequired("Column", Value.Column);
+struct NormalizedPosition {
+  using Position = clang::clangd::SymbolLocation::Position;

I don't think there's any reason to pack the YAML encoding.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D53363



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D53363: [clangd] Encode Line/Column as a 32-bits integer.

2018-10-17 Thread Haojian Wu via Phabricator via cfe-commits
hokein created this revision.
hokein added a reviewer: sammccall.
Herald added subscribers: kadircet, arphaman, jkorous, MaskRay, ioeric, 
ilya-biryukov.

This would buy us more memory. Using a 32-bits integer is enough for
most human-readable source code (up to 4M lines and 4K columns).

Previsouly, we used 8 bytes for a position, now 4 bytes, it would save
us 8 bytes for each Ref and each Symbol instance.

For LLVM-project binary index file, we save ~13% memory.

| Before | After |
| 412MB  | 355MB |


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D53363

Files:
  clangd/index/Index.h
  clangd/index/YAMLSerialization.cpp
  unittests/clangd/FileIndexTests.cpp
  unittests/clangd/SerializationTests.cpp

Index: unittests/clangd/SerializationTests.cpp
===
--- unittests/clangd/SerializationTests.cpp
+++ unittests/clangd/SerializationTests.cpp
@@ -33,12 +33,8 @@
   Lang:Cpp
 CanonicalDeclaration:
   FileURI:file:///path/foo.h
-  Start:
-Line: 1
-Column: 0
-  End:
-Line: 1
-Column: 1
+  Start: 4096 # Line 1, Column 0
+  End: 4097 # Line 1, Column 1
 Origin:4
 Flags:1
 Documentation:'Foo doc'
@@ -59,12 +55,8 @@
   Lang:Cpp
 CanonicalDeclaration:
   FileURI:file:///path/bar.h
-  Start:
-Line: 1
-Column: 0
-  End:
-Line: 1
-Column: 1
+  Start: 4096 # Line 1, Column 0
+  End: 4097 # Line 1, Column 1
 Flags:2
 Signature:'-sig'
 CompletionSnippetSuffix:'-snippet'
@@ -75,12 +67,8 @@
   - Kind: 4
 Location:
   FileURI:file:///path/foo.cc
-  Start:
-Line: 5
-Column: 3
-  End:
-Line: 5
-Column: 8
+  Start: 20483 # Line 5, Column 3
+  End: 20485 # Line5, Column 8
 )";
 
 MATCHER_P(ID, I, "") { return arg.ID == cantFail(SymbolID::fromStr(I)); }
Index: unittests/clangd/FileIndexTests.cpp
===
--- unittests/clangd/FileIndexTests.cpp
+++ unittests/clangd/FileIndexTests.cpp
@@ -32,10 +32,10 @@
 using testing::UnorderedElementsAre;
 
 MATCHER_P(RefRange, Range, "") {
-  return std::tie(arg.Location.Start.Line, arg.Location.Start.Column,
-  arg.Location.End.Line, arg.Location.End.Column) ==
- std::tie(Range.start.line, Range.start.character, Range.end.line,
-  Range.end.character);
+  return arg.Location.Start.Line == Range.start.line &&
+ arg.Location.Start.Column == Range.start.character &&
+ arg.Location.End.Line == Range.end.line &&
+ arg.Location.End.Column == Range.end.character;
 }
 MATCHER_P(FileURI, F, "") { return arg.Location.FileURI == F; }
 MATCHER_P(DeclURI, U, "") { return arg.CanonicalDeclaration.FileURI == U; }
Index: clangd/index/YAMLSerialization.cpp
===
--- clangd/index/YAMLSerialization.cpp
+++ clangd/index/YAMLSerialization.cpp
@@ -94,18 +94,37 @@
   uint8_t Origin = 0;
 };
 
-template <> struct MappingTraits {
-  static void mapping(IO , SymbolLocation::Position ) {
-IO.mapRequired("Line", Value.Line);
-IO.mapRequired("Column", Value.Column);
+struct NormalizedPosition {
+  using Position = clang::clangd::SymbolLocation::Position;
+  NormalizedPosition(IO &) {}
+  NormalizedPosition(IO &, const Position ) {
+static_assert(
+sizeof(Position) == sizeof(uint32_t),
+"SymbolLocation::Position structure can not fit into a uint32_t.");
+value = (Pos.Line << Position::ColumnBits) + Pos.Column;
   }
+
+  Position denormalize(IO &) {
+Position Pos;
+Pos.Line = value >> Position::ColumnBits;
+Pos.Column = value & ((1 << Position::ColumnBits) - 1);
+return Pos;
+  }
+
+  // Encode the SymbolLocation::Position:
+  // |  Line  | Column |
+  uint32_t value;
 };
 
 template <> struct MappingTraits {
   static void mapping(IO , SymbolLocation ) {
 IO.mapRequired("FileURI", Value.FileURI);
-IO.mapRequired("Start", Value.Start);
-IO.mapRequired("End", Value.End);
+MappingNormalization NStart(
+IO, Value.Start);
+IO.mapRequired("Start", NStart->value);
+MappingNormalization NEnd(
+IO, Value.End);
+IO.mapRequired("End", NEnd->value);
   }
 };
 
Index: clangd/index/Index.h
===
--- clangd/index/Index.h
+++ clangd/index/Index.h
@@ -33,10 +33,17 @@
 struct SymbolLocation {
   // Specify a position (Line, Column) of symbol. Using Line/Column allows us to
   // build LSP responses without reading the file content.
+  //
+  // Position is encoded as a 32-bits integer like below:
+  // |<--- 20 bits --->|<-12 bits->|
+  // |<---  Line   --->|<- Column->|
   struct Position {
-uint32_t Line = 0; // 0-based
+static constexpr int LineBits = 20;
+static constexpr int ColumnBits = 12;
+
+uint32_t Line : LineBits; // 0-based
 // Using UTF-16 code