This is where I left off with process.cpp.

Given that I'll need to figure out how to implement (or otherwise
handle) diffing files, possibly including file moves (it might make
sense to use an existing git library),

it seems a better investment of time would be to set up adapters to
train embeddings, since generating data slowly is not a blocking
issue, but the model may not succeed on useful data without a slightly
better tokenizer.
#include <cstddef>
#include <cstdint>
#include <exception>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

using namespace std;

/**
 * One serialized object as it appears on the input stream: a hash, a type
 * name, and the raw payload bytes.
 */
struct RawObject
{
    /**
     * Reads one record from `is`.
     *
     * The record is three whitespace-separated tokens (hash, type, payload
     * size in bytes), a single separator character, and then exactly `size`
     * bytes of payload, which are stored in `data`.
     *
     * @param is  stream positioned at the start of a record.
     * @throws std::runtime_error if the three header tokens cannot be
     *         extracted, or if the stream ends before `size` payload bytes
     *         have been read.
     */
    void read(std::istream & is)
    {
        std::uint64_t size = 0;
        if (!(is >> hash >> type >> size)) {
            // Without this check a failed extraction would leave `size`
            // unset and the resize below would use a garbage value.
            throw std::runtime_error("RawObject: could not read object header");
        }
        is.ignore(1); // newline after size
        data.resize(size);

        std::size_t filled = 0;
        while (filled < data.size()) {
            is.read(data.data() + filled,
                    static_cast<std::streamsize>(data.size() - filled));
            std::streamsize const got = is.gcount();
            if (got <= 0) {
                // No progress means end of input or a stream error; looping
                // again would never terminate.
                throw std::runtime_error("RawObject: truncated object data");
            }
            filled += static_cast<std::size_t>(got);
        }
    }

    std::string hash, type;
    std::vector<char> data;
};

struct Commit
{
    Commit() {}
    void parse(RawObject const & obj)
    {
        istringstream ss(string(obj.data.begin(), obj.data.end()));

        header.clear();
        fields.clear();
        header.emplace_back("commit", obj.hash);

        static thread_local string temp;
        while ("reading commit header") {
            switch (ss.peek()) {
            default:
                header.resize(header.size() + 1);
                ss >> header.back().first; ss.ignore(1);
                getline(ss, header.back().second);
                continue;
            case ' ':
                ss.ignore(1); header.back().second += '\n';
                getline(ss, temp);
                header.back().second += temp;
                continue;
            case '\n':
                ss.ignore(1);
                goto end_of_header;
            }
        }

end_of_header:
        for (auto & item : header) {
            fields.emplace(item.first, &item.second);
        }
        message = string(istreambuf_iterator<char>(ss), {});
    }

    string const & operator[](string const & field)
    {
        return *fields.equal_range(field).first->second;
    }

    vector<string> all(string const & field)
    {
        vector<string> result;
        auto range = fields.equal_range(field);
        for (auto it = range.first; it != range.second; ++ it) {
            result.emplace_back(*it->second);
        }
        return result;
    }

    vector<pair<string, string>> header;
    unordered_multimap<string, string *> fields;
    string message;
};

/**
 * Parsed form of a "tree" object: a flat list of entries, each with a mode,
 * a name, and a 20-byte binary hash.
 */
struct Tree
{
    Tree() {}

    /**
     * Replaces `entries` with the entries decoded from `obj.data`.
     *
     * @throws std::runtime_error if the data ends in the middle of an entry;
     *         the incomplete entry is not kept.
     */
    void parse(RawObject const & obj)
    {
        entries.clear();
        std::istringstream ss(std::string(obj.data.begin(), obj.data.end()));

        while (ss.peek() != std::char_traits<char>::eof()) {
            entries.emplace_back();
            entries.back().parse(ss);
            if (!ss) {
                entries.pop_back();
                throw std::runtime_error("Tree: truncated entry");
            }
        }
    }

    struct Entry
    {
        Entry() {}

        /**
         * Reads one entry from `data`: the mode up to a space, the name up
         * to a NUL byte, then 20 raw hash bytes. On a short read the stream
         * is left in a failed state for the caller to check.
         */
        void parse(std::istream & data)
        {
            std::getline(data, mode, ' ');
            std::getline(data, name, '\0');
            data.read(_hash, sizeof _hash);
        }

        std::string mode;
        std::string name;

        /**
         * Returns the hash as 40 lowercase hexadecimal characters.
         *
         * The returned reference is to a buffer shared by every Entry on
         * the calling thread, so it is overwritten by the next call to
         * hash() on that thread. Copy the result if it must be kept.
         */
        std::string const & hash() const
        {
            static char const digits[] = "0123456789abcdef";
            static thread_local std::string hex(2 * sizeof _hash, '0');
            for (std::size_t i = 0; i < sizeof _hash; ++ i) {
                unsigned char const byte = static_cast<unsigned char>(_hash[i]);
                hex[2 * i] = digits[byte >> 4];
                hex[2 * i + 1] = digits[byte & 0x0f];
            }
            return hex;
        }

        char _hash[20] = {}; // raw binary hash, zero until parse() fills it
    };
    std::vector<Entry> entries;
};

/* =========================================================== */

class Reader
{
public:
    Reader(istream & is)
    : is(is)
    { }
    void read()
    {
        static thread_local RawObject obj;

        obj.read(is);
        self.object(obj);
    }
protected:
    virtual void object(RawObject & obj)
    {
        static thread_local Commit commit;
        static thread_local Tree tree;
        if (obj.type == "commit")
        {
            commit.parse(obj);
            this->commit(obj, commit);
        }
        else if (obj.type == "tree")
        {
            tree.parse(obj);
            this->tree(obj, tree);
        }
        else if (obj.type == "blob")
        {
            this->blob(obj);
        }
        else
        {
            throw "not a commit tree or blob";
        }
    }
    virtual void commit(RawObject & obj, Commit & commit) 
    {
        cout << obj.type << " " << obj.data.size() << endl;
        for ( auto item : commit.fields ) {
            cout << item.first << ": " << *item.second << endl;
        }   
        cout << commit.message << endl;
    }
    virtual void tree(RawObject & obj, Tree & tree)
    {
        cout << obj.type << " " << obj.data.size() << endl;
        for ( auto & entry : tree.entries ) {
            cout << entry.name << ": " << entry.hash() << endl;
        }
    }
    virtual void blob(RawObject & obj)
    {
        cout << obj.type << " " << obj.hash << " " << obj.data.size() << endl;
    }

private:
    istream & is;
};

class DataGen : Reader
{
public:
    using Reader::Reader;

protected:
    virtual void object(RawObject & obj)
    {
        objects.emplace(obj.hash, obj);
        Reader::object(obj);
    }

    virtual void tree(RawObject & obj, Tree & tree)
    {
        auto & list = trees[obj.hash];
        for (Entry & entry : tree.entries) {
            list.emplace(entry.name, entry.hash());
        }
    }

    virtual void commit(RawObject & obj, Commit & commit)
    {
    }

    unordered_map<string, RawObject> objects;
    unordeded_map<string, unordered_map<string, string>> trees;

    vector<Commit> commit_queue;

    struct QueuedCommit
    {
        /* to output a commit, we at least need the full trees of its parents, also blobs of any changed files. */
            /* the diff information is not shown here, so it may make sense to make current setup work better. embedding training. */

    };
};

/**
 * Reads objects from standard input until end of input and lets Reader print
 * a summary of each.
 *
 * @return 0 when all input was consumed, 1 if reading or parsing raised an
 *         error (the error text is written to standard error).
 */
int main()
{
    Reader reader(std::cin);
    try {
        // Skip whitespace between records first so that trailing newlines
        // at the end of the input are not mistaken for another record.
        while ((std::cin >> std::ws)
               && std::cin.peek() != std::char_traits<char>::eof()) {
            reader.read();
        }
    } catch (std::exception const & e) {
        std::cerr << "error: " << e.what() << std::endl;
        return 1;
    } catch (char const * message) {
        // Some code paths report failures by throwing a string literal.
        std::cerr << "error: " << message << std::endl;
        return 1;
    }
    return 0;
}

Reply via email to