gigasquid commented on a change in pull request #13865: Modifying clojure CNN 
text classification example
URL: https://github.com/apache/incubator-mxnet/pull/13865#discussion_r247353090
 
 

 ##########
 File path: 
contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj
 ##########
 @@ -21,53 +21,84 @@
             [org.apache.clojure-mxnet.context :as context]
             [org.apache.clojure-mxnet.ndarray :as ndarray]
             [org.apache.clojure-mxnet.random :as random])
-  (:import (java.io DataInputStream))
+  (:import (java.io DataInputStream)
+           (java.nio ByteBuffer ByteOrder))
   (:gen-class))
 
 (def w2v-file-path "../../data/GoogleNews-vectors-negative300.bin") ;; the 
word2vec file path
-(def max-vectors 100) ;; If you are using word2vec embeddings and you want to 
only load part of them
-
-(defn r-string [dis]
-  (let [max-size 50
-        bs (byte-array max-size)
-        sb (new StringBuilder)]
-    (loop [b (.readByte dis)
-           i 0]
-      (if (and (not= 32 b) (not= 10 b))
-        (do (aset bs i b)
-            (if (= 49 i)
-              (do (.append sb (new String bs))
-                  (recur (.readByte dis) 0))
-              (recur (.readByte dis) (inc i))))
-        (.append sb (new String bs 0 i))))
-    (.toString sb)))
-
-(defn get-float [b]
-  (-> 0
-      (bit-or (bit-shift-left (bit-and (aget b 0) 0xff) 0))
-      (bit-or (bit-shift-left (bit-and (aget b 1) 0xff) 8))
-      (bit-or (bit-shift-left (bit-and (aget b 2) 0xff) 16))
-      (bit-or (bit-shift-left (bit-and (aget b 3) 0xff) 24))))
+(def EOS "</s>")  ;; end of sentence word
+
+(defn glove-file-path
+  "Returns the file path to GloVe embedding of the input size"
+  [embedding-size]
+  (format "data/glove/glove.6B.%dd.txt" embedding-size))
+
+(defn r-string
+  "Reads a string from the given DataInputStream `dis` until a space or 
newline is reached."
+  [dis]
+  (loop [b (.readByte dis)
+         bs []]
+    (if (and (not= 32 b) (not= 10 b))
+      (recur (.readByte dis) (conj bs b))
+      (new String (byte-array bs)))))
+
+(defn get-float [bs]
+  (-> (ByteBuffer/wrap bs)
+      (.order ByteOrder/LITTLE_ENDIAN)
+      (.getFloat)))
 
 (defn read-float [is]
   (let [bs (byte-array 4)]
     (do (.read is bs)
         (get-float bs))))
 
-(defn load-google-model [path]
-  (println "Loading the word2vec model from binary ...")
-  (with-open [bis (io/input-stream path)
-              dis (new DataInputStream bis)]
-    (let [word-size (Integer/parseInt (r-string dis))
-          dim  (Integer/parseInt (r-string dis))
-          _  (println "Processing with " {:dim dim :word-size word-size} " 
loading max vectors " max-vectors)
-          word2vec (reduce (fn [r _]
-                             (assoc r (r-string dis)
-                                    (mapv (fn [_] (read-float dis)) (range 
dim))))
-                           {}
-                           (range max-vectors))]
-      (println "Finished")
-      {:num-embed dim :word2vec word2vec})))
+(defn- load-w2v-vectors
+  "Lazily loads the word2vec vectors given a data input stream `dis`,
 
 Review comment:
   Nice refactoring

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Reply via email to