westonpace commented on a change in pull request #12537:
URL: https://github.com/apache/arrow/pull/12537#discussion_r817330103



##########
File path: cpp/src/arrow/compute/exec/tpch_node.cc
##########
@@ -0,0 +1,3738 @@
+#include "arrow/compute/exec/tpch_node.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/future.h"
+#include "arrow/util/unreachable.h"
+
+#include <algorithm>
+#include <bitset>
+#include <cstring>
+#include <random>
+#include <vector>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <unordered_set>
+
+namespace arrow
+{
+    using internal::checked_cast;
+
+    namespace compute
+    {
+        class TpchText
+        {
+        public:
+            Status Init();
+            Result<Datum> GenerateComments(
+                size_t num_comments,
+                size_t min_length,
+                size_t max_length,
+                random::pcg32_fast &rng);
+
+        private:
+            void GenerateWord(size_t &offset, const char **words, size_t num_choices);
+            void GenerateNoun(size_t &offset);
+            void GenerateVerb(size_t &offset);
+            void GenerateAdjective(size_t &offset);
+            void GenerateAdverb(size_t &offset);
+            void GeneratePreposition(size_t &offset);
+            void GenerateAuxiliary(size_t &offset);
+            void GenerateTerminator(size_t &offset);
+
+            void GenerateNounPhrase(size_t &offset);
+            void GenerateVerbPhrase(size_t &offset);
+            void GeneratePrepositionalPhrase(size_t &offset);
+
+            void GenerateSentence(size_t &offset);
+
+            std::unique_ptr<Buffer> text_;
+            random::pcg32_fast rng_;
+            static constexpr size_t kTextBytes = 300 * 1024 * 1024; // 300 MB
+        };
+
+        class TpchTableGenerator
+        {
+        public:
+            using OutputBatchCallback = std::function<void(ExecBatch)>;
+            using FinishedCallback = std::function<void(int64_t)>;
+            using GenerateFn = std::function<Status(size_t)>;
+            using ScheduleCallback = std::function<Status(GenerateFn)>;
+            using AbortCallback = std::function<void()>;
+
+            virtual Status Init(
+                std::vector<std::string> columns,
+                int scale_factor,
+                int64_t batch_size) = 0;
+
+            virtual Status StartProducing(
+                size_t num_threads,
+                OutputBatchCallback output_callback,
+                FinishedCallback finished_callback,
+                ScheduleCallback schedule_callback) = 0;
+
+            void Abort(AbortCallback abort_callback)
+            {
+                bool expected = false;
+                if(done_.compare_exchange_strong(expected, true))
+                {
+                    abort_callback();
+                }
+            }
+
+            virtual std::shared_ptr<Schema> schema() const = 0;
+
+            virtual ~TpchTableGenerator() = default;
+
+        protected:
+            std::atomic<bool> done_ = { false };
+            std::atomic<int64_t> batches_generated_ = { 0 };
+        };
+
+        int GetNumDigits(int64_t x)
+        {
+            // This if statement chain is for MAXIMUM SPEED
+            /*
+              .,
+              .      _,'f----.._
+              |\ ,-'"/  |     ,'
+              |,_  ,--.      /
+              /,-. ,'`.     (_
+              f  o|  o|__     "`-.
+              ,-._.,--'_ `.   _.,-`
+              `"' ___.,'` j,-'
+              `-.__.,--'
+             */
+            // Source: https://stackoverflow.com/questions/1068849/how-do-i-determine-the-number-of-digits-of-an-integer-in-c

Review comment:
       Modification doesn't really matter from a legal perspective.  Derivative 
works are only allowed if you have rights to the source.
   
   That being said, we are in a terrible gray area here that I fear will become 
a can of worms and a waste of everyone's time.  Technically, the only "proper" 
approach would be a [clean room 
approach](https://en.wikipedia.org/wiki/Clean_room_design), where @save-buffer 
describes what is needed to someone who has never seen the SO code and that 
person then writes the code.  However, legally documenting such a process is a 
headache.
   
   In this case I think we are (at least ethically) in the clear.  The answer 
author has this statement on their profile page:
   
   > All code I post on Stack Overflow is covered by the "Do whatever the heck 
you want with it" licence, the full text of which is:
   > 
   >     Do whatever the heck you want with it.
   > 
   
   There are a few edits by other authors, but none of them touches the code 
included here.  My opinion would be to proceed as is (but get rid of the ASCII 
art; I can't abide fun).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to