[GitHub] astonzhang commented on a change in pull request #8763: Add mxnet.text APIs

GitBox Mon, 08 Jan 2018 17:25:47 -0800

astonzhang commented on a change in pull request #8763: Add mxnet.text APIs
URL: https://github.com/apache/incubator-mxnet/pull/8763#discussion_r160299863


 ##########
 File path: python/mxnet/text/embedding.py
 ##########
 @@ -0,0 +1,600 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=consider-iterating-dictionary
+
+"""Text token embeddings."""
+from __future__ import absolute_import
+from __future__ import print_function
+
+import io
+import logging
+import os
+import tarfile
+import warnings
+import zipfile
+
+from . import constants as C
+from ..gluon.utils import download
+from .indexer import TokenIndexer
+from .. import ndarray as nd
+from .. import registry
+
+
+class TokenEmbedding(TokenIndexer):
+    """Text embedding base class.
+
+
+    To load text embeddings from an externally hosted pre-trained text embedding
+    file, such as those of GloVe and FastText, use
+    `TokenEmbedding.create(embedding_name, pretrained_file_name)`. To get all the
+    available `embedding_name` and `pretrained_file_name`, use
+    `TokenEmbedding.get_embedding_and_pretrained_file_names()`.
+
+    Alternatively, to load embedding vectors from a custom pre-trained text
+    embedding file, use :func:`~mxnet.text.embedding.CustomEmbedding`.
+
+    For every unknown token, if its representation `self.unknown_token` is
+    encountered in the pre-trained text embedding file, index 0 of
+    `self.idx_to_vec` maps to the pre-trained text embedding vector loaded from
+    the file; otherwise, index 0 of `self.idx_to_vec` maps to the text embedding
+    vector initialized by `init_unknown_vec`.
+
+    If a token is encountered multiple times in the pre-trained text embedding
+    file, only the first-encountered text embedding vector will be loaded and
+    the rest will be skipped.
+
+    For the same token, its index and embedding vector may vary across different
+    instances of :class:`~mxnet.text.embedding.TokenEmbedding`.
+
+
+    Properties
+    ----------
+    vec_len : int
+        The length of the embedding vector for each token.
+    idx_to_vec : mxnet.ndarray.NDArray
+        For all the indexed tokens in this embedding, this NDArray maps each
+        token's index to an embedding vector. The largest valid index maps
+        to the initialized embedding vector for every reserved token, such as an
+        unknown token and a padding token.
+    """
+
+    def __init__(self, **kwargs):
+        """Forward all keyword arguments unchanged to `TokenIndexer.__init__`."""
+        super(TokenEmbedding, self).__init__(**kwargs)
+
+    @classmethod
+    def _get_pretrained_file_path_from_url(cls, url, embedding_root,
+                                           pretrained_file_name):
+        """Get the local path to the pre-trained text embedding file from url.
+
+
+        The pre-trained embedding file will be downloaded from url if it has
+        not been downloaded yet or the existing file fails to match its
+        expected SHA-1 hash. If the downloaded file is a `.zip` or `.gz`
+        archive, it is extracted into the embedding directory.
+
+
+        Parameters
+        ----------
+        url : str
+            The url of the file to download. Its base name is used as the
+            local name of the downloaded file.
+        embedding_root : str
+            The root directory for embedding files; `~` is expanded. Files are
+            stored in a sub-directory named after the lower-cased class name.
+        pretrained_file_name : str
+            The name of the pre-trained text embedding file. It must be a key
+            of `cls.pretrained_file_name_sha1`.
+
+
+        Returns
+        -------
+        str
+            The local path to the pre-trained text embedding file.
+        """
+
+        # Imported locally so this method is self-contained with respect to
+        # the module-level import list.
+        from ..gluon.utils import check_sha1
+
+        embedding_cls = cls.__name__.lower()
+        embedding_root = os.path.expanduser(embedding_root)
+
+        embedding_dir = os.path.join(embedding_root, embedding_cls)
+        pretrained_file_path = os.path.join(embedding_dir,
+                                            pretrained_file_name)
+        downloaded_file = os.path.basename(url)
+        downloaded_file_path = os.path.join(embedding_dir, downloaded_file)
+
+        expected_file_hash = \
+            cls.pretrained_file_name_sha1[pretrained_file_name]
+
+        # When the embedding is distributed inside an archive, the archive has
+        # its own hash; otherwise the downloaded file is the embedding file.
+        if hasattr(cls, 'pretrained_archive_name_sha1'):
+            expected_downloaded_hash = \
+                cls.pretrained_archive_name_sha1[downloaded_file]
+        else:
+            expected_downloaded_hash = expected_file_hash
+
+        # If the embedding file already exists and matches its expected hash,
+        # there is no need to download or to extract the archive again.
+        if os.path.isfile(pretrained_file_path) \
+                and check_sha1(pretrained_file_path, expected_file_hash):
+            return pretrained_file_path
+
+        # If downloaded_file_path exists and matches expected_downloaded_hash,
+        # `download` is called with that hash so it can skip the transfer.
+        download(url, downloaded_file_path, sha1_hash=expected_downloaded_hash)
+
+        # NOTE(review): `extractall` trusts the member paths in the archive;
+        # this relies on the SHA-1 check above to reject tampered archives.
+        ext = os.path.splitext(downloaded_file)[1]
+        if ext == '.zip':
+            with zipfile.ZipFile(downloaded_file_path, 'r') as zf:
+                zf.extractall(embedding_dir)
+        elif ext == '.gz':
+            # Assumes every `.gz` download is a gzipped tar archive.
+            with tarfile.open(downloaded_file_path, 'r:gz') as tar:
+                tar.extractall(path=embedding_dir)
+        return pretrained_file_path
+
+    def _load_embedding(self, pretrained_file_path, elem_delim,
+                        init_unknown_vec, encoding='utf8'):
+        """Load embedding vectors from the pre-trained text embedding file.
+
+
+        For every unknown token, if its representation `self.unknown_token` is
+        encountered in the pre-trained text embedding file, index 0 of
+        `self.idx_to_vec` maps to the pre-trained text embedding vector loaded
+        from the file; otherwise, index 0 of `self.idx_to_vec` maps to the text
+        embedding vector initialized by `init_unknown_vec`.
+
+        If a token is encountered multiple times in the pre-trained text
+        embedding file, only the first-encountered text embedding vector will
+        be loaded and the rest will be skipped.
+        """
+
+        pretrained_file_path = os.path.expanduser(pretrained_file_path)
+
+        if not os.path.isfile(pretrained_file_path):
+            raise ValueError('`pretrained_file_path` must be a valid path to '
+                             'the pre-trained text embedding file.')
+
+        with io.open(pretrained_file_path, 'r', encoding=encoding) as f:
+            lines = f.readlines()
+
+        logging.info('Loading pre-trained text embedding vectors from %s',
+                     pretrained_file_path)
+
+        vec_len = None
+        all_elems = []
+        tokens = set()
+        loaded_unknown_vec = None
+        line_num = 0
+        for line in lines:
+            line_num += 1
+            elems = line.rstrip().split(elem_delim)
+
+            assert len(elems) > 1, 'At line %d of the pre-trained text ' \
+                                   'embedding file: the data format of the ' \
+                                   'pre-trained text embedding file %s is ' \
+                                   'unexpected.' \
+                                   % (line_num, pretrained_file_path)
+
+            token, elems = elems[0], [float(i) for i in elems[1:]]
+
+            if token == self.unknown_token and loaded_unknown_vec is None:
+                loaded_unknown_vec = elems
+                tokens.add(self.unknown_token)
+            elif token in tokens:
+                warnings.warn('At line %d of the pre-trained text embedding '
+                              'file: the embedding vector for token %s has '
+                              'been loaded and a duplicate embedding for the '
+                              'same token is seen and skipped.'
+                              % (line_num, token))
+            elif len(elems) == 1:
+                warnings.warn('At line %d of the pre-trained text '
+                              'embedding file: token %s with 1-dimensional '
+                              'vector %s is likely a header and is '
+                              'skipped.' % (line_num, token, elems))
+            else:
+                if vec_len is None:
+                    vec_len = len(elems)
+                    # Reserve a vector slot for the unknown token at the
+                    # very beginning because the unknown index is 0.
+                    all_elems.extend([0] * vec_len)
+                else:
+                    assert len(elems) == vec_len, \
+                        'At line %d of the pre-trained text embedding ' \
+                        'file: the dimension of token %s is %d but the ' \
+                        'dimension of previous tokens is %d. Dimensions ' \
+                        'of all the tokens must be the same.' \
+                        % (line_num, token, len(elems), vec_len)
+                all_elems.extend(elems)
+                self._idx_to_token.append(token)
+                self._token_to_idx[token] = len(self._idx_to_token) - 1
+                tokens.add(token)
+
+        self._vec_len = vec_len
+        self._idx_to_vec = nd.array(all_elems).reshape((-1, self.vec_len))
+
+        if loaded_unknown_vec is None:
+            self._idx_to_vec[C.UNKNOWN_IDX] = init_unknown_vec(
 
 Review comment:
   constants.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

[GitHub] astonzhang commented on a change in pull request #8763: Add mxnet.text APIs

Reply via email to