This is an automated email from the ASF dual-hosted git repository. joern pushed a commit to branch add_split in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit e763283aeb92cd7111b53c5328b104fd6a2556e7 Author: Jörn Kottmann <[email protected]> AuthorDate: Mon Aug 27 16:18:44 2018 +0200 Add split.py to split training data into pieces --- tf-ner-poc/src/main/python/namefinder/split.py | 60 ++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/tf-ner-poc/src/main/python/namefinder/split.py b/tf-ner-poc/src/main/python/namefinder/split.py new file mode 100644 index 0000000..6657bbf --- /dev/null +++ b/tf-ner-poc/src/main/python/namefinder/split.py @@ -0,0 +1,60 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
#

"""Randomly split a line-oriented data file into train, dev and test files."""

import random
import sys

# Cumulative thresholds applied to a uniform draw from [0.0, 1.0):
# draws below TRAIN_THRESHOLD go to train, draws below DEV_THRESHOLD go to
# dev, and everything else goes to test (roughly an 80/10/10 split).
TRAIN_THRESHOLD = 0.8
DEV_THRESHOLD = 0.9


def main():
    """Split the data file named on the command line into three files.

    Expects exactly four command-line arguments in ``sys.argv``:
    ``data_file train_file dev_file test_file``. Every non-blank line of
    ``data_file`` is assigned, by an independent ``random.random()`` draw,
    to exactly one of the three output files. Blank (whitespace-only)
    lines are dropped. The relative order of lines is preserved within
    each output file. All files are read and written as UTF-8.

    Returns:
        0 on success, 1 when the argument count is wrong (a usage message
        is printed to stderr in that case and no file is touched).

    Raises:
        OSError: if the data file cannot be read or an output file cannot
            be written.
        UnicodeDecodeError: if the data file is not valid UTF-8.
    """
    if len(sys.argv) != 5:
        # Report misuse on stderr with a non-zero status so calling
        # scripts can detect the failure.
        print("Usage split.py data_file train_file dev_file test_file",
              file=sys.stderr)
        return 1

    data_path, train_path, dev_path, test_path = sys.argv[1:]

    train = []
    dev = []
    test = []

    # The input is read completely before any output file is opened, so an
    # output path that happens to alias the input does not truncate it
    # while it is still being read.
    with open(data_path, encoding="utf-8") as data_file:
        for line in data_file:
            if not line.strip():
                continue

            draw = random.random()
            if draw < TRAIN_THRESHOLD:
                train.append(line)
            elif draw < DEV_THRESHOLD:
                dev.append(line)
            else:
                # random.random() never returns 1.0, so this branch takes
                # every remaining draw.
                test.append(line)

    outputs = ((train_path, train), (dev_path, dev), (test_path, test))
    for path, lines in outputs:
        with open(path, "w", encoding="utf-8") as out_file:
            # Lines keep the line endings they were read with.
            out_file.writelines(lines)

    return 0


if __name__ == "__main__":
    sys.exit(main())
