Here 'tis.

On Thursday, April 23, 2020 at 3:22:17 PM UTC-4, Edward K. Ream wrote:
>
>
>
> On Thu, Apr 23, 2020 at 1:52 PM Thomas Passin <[email protected] 
> <javascript:>> wrote:
>
>> Here's another one of the old files.  
>>
>
> Please send the external file. I need the external file to be able to see 
> the crash. Without it, the .leo file loads, with lots of "recovered nodes".
>
> Edward
>

-- 
You received this message because you are subscribed to the Google Groups 
"leo-editor" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/leo-editor/21078b4d-cb93-4442-9792-4f841526659e%40googlegroups.com.
#@+leo-ver=4-thin
#@+node:tom.20060721125850.5:@thin lsa.py
#@+others
#@+node:tom.20060721125850.6:Imports
import math
import re
import string
import sys
#@nonl
#@-node:tom.20060721125850.6:Imports
#@+node:tom.20060721125850.7:Stop Words
# Words too common to carry meaning; removeStops() drops these before the
# term-document matrix is built.  All entries are lower-case, since the
# word lists are lower-cased before filtering.  ("don t" covers "don't"
# after apostrophes have been turned into spaces upstream.)
# Fix: the original list contained 'the' twice.
STOPWORDS = [
    'a', 'about', 'after', 'again', 'all', 'already', 'also', 'am','an', 
    'and', 'any', 'anyone', 'are',
    'as', 'at',
    'be', 'been', 'by', 'but',
    'can', 'com', 
    'd', 'did', 'do', "don t",
    'e',
    'from', 'for', 
    'he', 'has', 'have', 'his', 'hers', 'her', 'however',
    'i', 'if', 'in', 'into', 'is', 'it', 'its', 
    'll', 'let', 'ly',
     'm','me', 'might', 'my', 'mine',
     'new', 'not',
     'o', 'one', 'on', 'of' , 'or', 'org', 'over',
     're',
     's', 'say', 'says', 'should', 'shall', 'she', 'so',
     't', 'thank', 'thanks', 'the', 'them', 'their', 'these', 'they', 'those', 
     'then', 'this', 'that', 'when', 'you', 'your', 
     'through', 'to', 'two', 'use', 'using',
     've',
     'was', 'will', 'with', 'what', 'while', 'why', 'were', 'who', 'would', 'work',
     'www',
     'ma', 'ri'
]
#@nonl
#@-node:tom.20060721125850.7:Stop Words
#@+node:tom.20060721125850.8:Remove Stop Words From List
def removeStops(wordlist, stopwords):
    """Lower-case every word in wordlist and drop the stop words.

    The membership test is done on the lower-cased form, so mixed-case
    words such as 'The' are filtered too.  (The original tested the
    un-lowered word against the lower-case stop list, letting them leak
    through.)  Returns a new list; wordlist is not modified.
    """
    stopset = set(stopwords)  # set for O(1) membership tests
    return [w for w in (word.lower() for word in wordlist) if w not in stopset]
#@nonl
#@-node:tom.20060721125850.8:Remove Stop Words From List
#@+node:tom.20060721125850.9:Count co-occurrances
#@+at 
# Routines to count co-occurrences of words within a given span (such as a 
# sentence).
# @others
#@-at
#@+node:tom.20060721125850.11:Count co-occurrences of words
#@+at
# def countCo_occurs(wordlist, high_value_list, co_count_dict):
#     '''co_count_dict is a dictionary to store the co-occurrence counts. Each
#     entry is keyed by a frozen set that contains the word pairs.
#     The "high value list" is a list of the special "high-valued" words that
#     occur in the same span as the wordlist words.'''
#     wordset = frozenset([word.lower() for word in wordlist])
#     hvset = frozenset(high_value_list) # Used for faster lookups than a list 
# would be
#     used = {}
#     for w in wordset:
#         if len(w) < 2 or  w in hvset: continue
# 
#         for hv in high_value_list:
#             #s = (w, hv)
#             s = (hv, w)
#             if s in used: continue
#             used[s] = True
#             if s not in co_count_dict:
#                 co_count_dict[s] = 0
#             co_count_dict[s] = co_count_dict[s] + 1
#@-at
#@nonl
#@-node:tom.20060721125850.11:Count co-occurrences of words
#@+node:tom.20060721130653:Count word occurrences in docs
#@+node:tom.20060721130653.1:Data Format
#@+at
#@@nocolor
# Input is a list of rows, where each row is a list of the words in a 
# "document"
# Output is a dictionary of unique words {word0:[count0, count1, ...], 
# word1:[...], ...}
# Here, count0 = number of occurrences of word in doc0, etc.
#@-at
#@nonl
#@-node:tom.20060721130653.1:Data Format
#@+node:tom.20060721130653.3:Count words
#@+at 
#@nonl
# Output: {word0:[count0, count1, ...], word1:[...], ...}
# Find list of unique words. For every word, process every line to count 
# occurrences
#@-at
#@@c
def _get_uniques(lines):
    uniques = set([])
    for line in lines:
        for word in line:
            uniques.add(word)
    wordlist = list(uniques)
    wordlist.sort()
    return wordlist

def countWords(lines):
    """Count word occurrences per document.

    lines: a list of rows, each row the list of words in one "document".
    Returns {word: [count_in_doc0, count_in_doc1, ...], ...} with one
    counts list per input row, in input order, for every distinct word.

    Fix: the original rescanned every line for every vocabulary word
    (O(vocab * total_words)); this builds one word->count table per line
    and is linear in the total number of words.
    """
    # One counting pass per document.
    per_line = []
    for line in lines:
        counts = {}
        for word in line:
            counts[word] = counts.get(word, 0) + 1
        per_line.append(counts)

    # Distinct vocabulary across all documents.
    uniques = set()
    for line in lines:
        uniques.update(line)

    result = {}
    for word in uniques:
        result[word] = [counts.get(word, 0) for counts in per_line]
    return result
    
#@nonl
#@-node:tom.20060721130653.3:Count words
#@-node:tom.20060721130653:Count word occurrences in docs
#@+node:tom.20060721151938:Compute cosine between two rows
# A vector is represented by a list of elements
def cosine(vector1, vector2, ignored=None):
    """Return the cosine similarity of two equal-length numeric vectors.

    ignored: optional collection of indices to skip in both vectors.
    Returns 0 when either (non-ignored) projection has zero norm, which
    is taken to be noisy data.

    Fixes: the body referenced a global name `ignores` instead of the
    `ignored` parameter (NameError unless a matching global existed), and
    used a mutable default argument.
    """
    skip = set(ignored) if ignored else set()
    dot = 0.0
    norm1 = 0.0
    norm2 = 0.0
    for n in range(len(vector1)):
        if n in skip:
            continue
        norm1 += vector1[n] * vector1[n]
        norm2 += vector2[n] * vector2[n]
        dot += vector1[n] * vector2[n]
    if norm1 == 0 or norm2 == 0:
        return 0  # degenerate / noisy data
    return dot / math.sqrt(norm1 * norm2)
#@nonl
#@-node:tom.20060721151938:Compute cosine between two rows
#@-node:tom.20060721125850.9:Count co-occurrances
#@+node:tom.20060721125850.12:Create output
#@+others
#@+node:tom.20060721125850.14:Matrix output for SVD
#@+others
#@+node:tom.20060721125850.15:Matrix format
#@@nocolor
#@+at
# 
# The SVD routine takes a matrix in this format:
#     a = [[22.,10., 2.,  3., 7.],
#     [14., 7.,10.,  0., 8.],
#     [-1.,13.,-1.,-11., 3.],
#     [-3.,-2.,13., -2., 4.],
#     [ 9., 8., 1., -2., 4.],
#     [ 9., 1.,-7.,  5.,-1.],
#     [ 2.,-6., 6.,  5., 1.],
#     [ 4., 5., 0., -2., 2.]]
# 
# There must be more rows than columns.
#@-at
#@nonl
#@-node:tom.20060721125850.15:Matrix format
#@+node:tom.20060721125850.16:makeMatrix
#@@nocolor
#@+at
# The Columns will be the "documents", each reprenseted by its line number.
# Each row will represent the number of times the row word occurs in the 
# column "doc".
# The data will be the output from countWords, which is a dictionary keyed by 
# the words.
# 
# First we have to make an lookup table of the words by index number.  Then we 
# can
# use this to load the matrix
# 
# We return the matrix and the word index
# 
#@@color
#@-at
#@@c
def makeMatrix(count_dict):
    """Build a term-document matrix from countWords() output.

    Each row of the matrix is the per-document count list for one word;
    rows appear in sorted word order, columns are the "documents".
    Returns (matrix, wordindex, wordlist) where wordindex maps row
    number -> word (despite its name) and wordlist is the sorted words.

    Fix: `keys = d.keys(); keys.sort()` fails under Python 3 (dict views
    have no sort); sorted() works in both Python 2 and 3.
    """
    wordlist = sorted(count_dict.keys())
    wordindex = {}
    for n, word in enumerate(wordlist):
        wordindex[n] = word
    # One matrix row (the count list) per word, in sorted order.
    matrix = [count_dict[word] for word in wordlist]
    return matrix, wordindex, wordlist
#@nonl
#@-node:tom.20060721125850.16:makeMatrix
#@-others
#@nonl
#@-node:tom.20060721125850.14:Matrix output for SVD
#@+node:tom.20060721125850.17:print matrix
def showMatrix(m):
    """Return a printable string for matrix m: a size header followed by
    one line per row, each element formatted '%.2f' and tab-separated.

    Fix: the original accumulated into a local named `str`, shadowing
    the builtin.
    """
    out = matrixSize(m) + '\n'
    for r, row in enumerate(m):
        out += '%s [ ' % (r)
        for e in row:
            out += '%.2f\t' % e
        out += ']\n'
    out += '\n'
    return out

def matrixSize(m):
    """Return 'R rows, C columns' for a non-empty matrix m."""
    return '%s rows, %s columns' % (len(m), len(m[0]))
#@nonl
#@-node:tom.20060721125850.17:print matrix
#@+node:tom.20060721125850.18:Matrix Multiply With Ignored Columns
def matrixMultiplyIgnores(a, b, ignores=None):
    """Multiply matrices a and b, treating the listed inner indices as zero.

    (AB)jk = sum over i of A[j][i] * B[i][k], skipping any i in ignores
    (equivalent to zeroing those rows/columns of the factors).
    Raises ValueError when the inner dimensions do not match.

    Fixes: the result columns iterated range(b_rows) instead of
    range(b_cols), producing a wrongly-shaped result for non-square b
    (the old `a_cols > b_cols` guard only masked the resulting index
    error and is no longer needed); Python-2-only raise syntax; mutable
    default argument; a local shadowed the builtin `sum`.
    """
    skip = set(ignores) if ignores else set()
    a_rows = len(a)
    a_cols = len(a[0])
    b_rows = len(b)
    b_cols = len(b[0])

    if a_cols != b_rows:
        raise ValueError('matrixMultiplyIgnores error: array sizes do not match.')

    result = []
    for j in range(a_rows):
        row = []
        for k in range(b_cols):
            total = 0
            for i in range(a_cols):
                if i not in skip:
                    total += a[j][i] * b[i][k]
            row.append(total)
        result.append(row)

    return result
    
#@-node:tom.20060721125850.18:Matrix Multiply With Ignored Columns
#@-others
#@nonl
#@-node:tom.20060721125850.12:Create output
#@+node:tom.20060721125850.19:Show Command Line Usage Message
def usage():
    """Print a command-line usage message to stdout.

    Fixes: the message claimed two files were needed, but the script
    takes exactly one source filename; print statements converted to the
    parenthesized form, valid in both Python 2 and 3.
    """
    print('Need a source filename on the command line.')
    print('Usage -')
    print('\t%s <source filename>' % sys.argv[0])
    print('')
#@nonl
#@-node:tom.20060721125850.19:Show Command Line Usage Message
#@+node:tom.20060721125850.20:Test
# Command-line driver: read a text file, treat each line as a "document",
# build a term-document count matrix, run an SVD, and print similarity
# reports between terms and between "concepts" (lines).
# NOTE(review): this driver is Python 2 syntax throughout (print statements,
# "except Exception, e") and will not run unmodified under Python 3.
if __name__ == '__main__':
    import sys
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)
        
    from pprint import pprint
    fname = sys.argv[1]    
    #@    @+others
    #@+node:tom.20060721125850.21:read and prepare word lists
    # read lines in the file into a list
    f = open(fname, 'r')
    lines = f.readlines()
    f.close()
    
    lines = [line.strip() for line in lines]
    #@-node:tom.20060721125850.21:read and prepare word lists
    #@+node:tom.20060721125850.22:count word occurrences
    # count word occurrences
    temp = []
    # Clean lines
    for line in lines:
        words = line.split()
        words = [w.lower() for w in words]
        # remove stopwords
        words = removeStops(words, STOPWORDS)
        temp.append(words)
    
    # Keep the raw text lines for display; work on the cleaned word lists.
    originals = lines    
    lines = temp
    countdict = countWords(lines)
    #@-node:tom.20060721125850.22:count word occurrences
    #@+node:tom.20060721125850.24:Create Matrix
    matrix, wordindex, wordlist = makeMatrix(countdict)
    #@-node:tom.20060721125850.24:Create Matrix
    #@+node:tom.20060721125850.25:Do SVD
    # svd: project-local module expected to provide svd(), transpose() and
    # matrixmultiply() -- not part of the standard library.
    import svd
    import math
    u,w,v = svd.svd(matrix)
    #@-node:tom.20060721125850.25:Do SVD
    #@+node:tom.20060721125850.26:Display results
    print 'Largest value in "W" matrix: %0.2f' % max(w)
    thresh = .5#1.5
    print 'Ignore threshold: %s' % thresh
    
    # Collect rows/cols of w to zero out
    ignores = []
    for i in range(len(w)):
        #print w[i]
        if abs(w[i]) < thresh: ignores.append(i)
    
    
    # NOTE(review): indexes the original text lines by singular-value index,
    # which assumes len(w) == number of input lines -- TODO confirm.
    temp = []
    for n in range(len(originals)):
        if n not in ignores:
            temp.append(originals[n])
    
    if temp:
        print ' %s retained lines:' % (len(originals) - len(ignores))
        for line in temp: print '  ' + line
        print
    
    print
    #@-node:tom.20060721125850.26:Display results
    #@+node:tom.20060721125850.27:Recover Matrix
    
    # Compute (what should be) the original matrix
    # full_w_ignores: diagonal matrix of singular values, with the ignored
    # (sub-threshold) values zeroed out.
    full_w_ignores = []
    
    for i in range(len(w)):
        row = []
        for j in range(len(w)):
            if i != j or i in ignores: row.append(0)
            else: row.append(w[i])
        full_w_ignores.append(row)
    
    # full_w: the complete diagonal matrix of singular values.
    full_w = []
    
    for i in range(len(w)):
        row = []
        for j in range(len(w)):
            if i != j: row.append(0)
            else: row.append(w[i])
        full_w.append(row)
    
    
    uv = matrixMultiplyIgnores(u, full_w)
    recovered_ignores = matrixMultiplyIgnores(uv, svd.transpose(v), ignores)
    
    recovered = matrixMultiplyIgnores(uv, svd.transpose(v))
    
    #@-node:tom.20060721125850.27:Recover Matrix
    #@+node:tom.20060721125850.28:Compute cosines between terms
    # C holds the pairwise dot products of the (rank-reduced) term vectors.
    C = svd.matrixmultiply( recovered_ignores, svd.transpose(recovered_ignores))
    #print showMatrix( C)
    
    # Term vector norms
    norms = []
    for t in range(len(C)):
       norms.append(math.sqrt(C[t][t]))
       
    #@+at 
    #@nonl
    # Cosines of all terms
    #  C = Dotproducts =
    #  t1*t1 t1*t2 ...
    #  t2*t1 t2*t2 ...
    #  t3*t1 t3*t2 t3*t3
    #  .
    #  .
    #  .
    # 
    # For cosines, each dot product is normalized and squareroot-ified
    #@-at
    #@@c
    
    Cosines = []
    for row in range(len(C)):
        Cosines.append([])
        for col in range(len(C)):
            try:
                Cosines[row].append(math.sqrt(abs(C[row][col])) /math.sqrt((norms[row] * norms[col])))
            except Exception, e:
                # Zero norms (degenerate term vectors) divide by zero here;
                # report and keep going.
                print e, C[row][col], norms[row], norms[col]
    
    #@+others
    #@+node:tom.20060721125850.29:Print term similarity
    #@+at
    # for w in range(len(recovered_ignores)):
    #     #if w_lookup[w] == 'reptile':
    #         print 'norm of %s: %s' % (w_lookup[w], norms[w])
    #         row = recovered_ignores[w]
    #         vector = []
    #         for c in range(len(row)):
    #             vector.append( (hv_lookup[c], abs(row[c])))
    #         vector.sort()
    #         vector.reverse()
    #         for concept in vector:
    #             print '  %s, %s' % (concept[1], concept[0])
    #@-at
    #@+at
    # for r in range(len(Cosines)):
    #     if norms[r] < 0.01: continue
    # 
    #     row = Cosines[r]
    #     word = w_lookup[r]
    #     words = []
    #     print '%s: ' % word
    #     for col in range(len(row)):
    #         if col == r: continue
    #         if norms[col] < 0.01: continue
    #         if row[col] > 0.5: # Minimum similarity threshold
    #             words.append((row[col], w_lookup[col]))
    #     words.sort(); words.reverse()
    #     for w in words:
    #         print '   %0.2f %s' % (w[0], w[1])
    #     print
    #@-at
    #@nonl
    #@-node:tom.20060721125850.29:Print term similarity
    #@+node:tom.20060721152707:Term similarity for two specific terms
    #@+at
    # # Find words to compare
    # word1 = 'jazz'
    # word2 = 'music'
    # n1 = -1
    # n2 = -1
    # 
    # for n in range(len(wordlist)):
    #     if wordlist[n] == word1:
    #         n1 = n
    #         break
    # for n in range(len(wordlist)):
    #     if wordlist[n] == word2:
    #         n2 = n
    #         break
    # 
    # if n1 == -1:
    #     print "Can't find '%s'" % word1
    #     sys.exit(0)
    # if n2 == -1:
    #     print "Can't find '%s'" % word2
    #     sys.exit(0)
    # print '%s, %s %0.2f' % (word1, word2, cosine(matrix[n1], 
    # matrix[n2])              )
    #@-at
    #@nonl
    #@-node:tom.20060721152707:Term similarity for two specific terms
    #@+node:tom.20060721125850.30:Term Similarity to Concepts
    #@@c
    # For each term (row of the reduced matrix), list the original lines
    # whose column weight exceeds 30% of the term vector's norm.
    results = []
    for r in range(len(recovered_ignores)):
        temp = []
        if norms[r] < 0.01: continue # skip really bad fits (i.e., large errors)
    
        row = recovered_ignores[r]
        word = wordindex[r]
        words = []
     
        for col in range(len(row)):
            if col == r: continue 
            if row[col] > 0.3 * norms[r]: # Minimum similarity threshold
                words.append((row[col]/norms[r], originals[col]))
        words.sort(); words.reverse()
        
        for w in words:
            temp.append( '   %0.2f %s' % (w[0], w[1]))
        results.append((word,temp))
    
    results.sort()
    for r in results:
        if not len(r[1]): continue
        
        print r[0]
        for item in r[1]:
            print item
        print
        
       
    #@nonl
    #@-node:tom.20060721125850.30:Term Similarity to Concepts
    #@+node:tom.20060721125850.31:Concept similarity to Terms
    #@+at 
    #@nonl
    # Same as Term Similarity to Concepts, but transpose matrix first
    # (Thus switching terms and concepts)
    # 
    #@-at
    #@@c
    trans = svd.transpose(recovered_ignores)
    # Transposed norms
    C = svd.matrixmultiply( trans, svd.transpose(trans))
    #@+at
    # # term vector norms
    # norms = []
    # for t in range(len(C)):
    #    norms.append(math.sqrt(C[t][t]))
    # 
    # 
    # results = []
    # for r in range(len(trans)):
    #     temp = []
    #     if norms[r] < 0.01: continue # skip really bad fits (i.e., large 
    # errors)
    # 
    #     row = trans[r]
    #     word = string.join(originals[r])
    #     words = []
    #     for col in range(len(row)):
    #         if col == r: continue
    #         if row[col] > 0.3 * norms[r]: # Minimum similarity threshold
    #             words.append((row[col]/norms[r], w_lookup[col]))
    #     words.sort(); words.reverse()
    #     for w in words:
    #         temp.append( '   %0.2f %s' % (w[0], w[1]))
    #     results.append((word,temp))
    # 
    # results.sort()
    # for r in results:
    #     if not len(r[1]): continue
    #     print r[0]
    #     for item in r[1]:
    #         print item
    #     print
    # @others
    #@-at
    #@+node:tom.20060721125850.32:Concept - concept similarity
    
    # cosines between concepts
     # Each row represents one "concept"
    results = []
    for r in range(len(trans)):
    
        row = trans[r]
        word = originals[r]
    
        for r2 in range(r+1, len(trans)):
            row2 = trans[r2]
            word2 = originals[r2]
            
            # Dot product
            dot = 0
            norm1 = 0
            norm2 = 0
            for n in range(len(row)):
                dot = dot + row[n] * row2[n]
                norm1 = norm1 + row[n] * row[n]
                norm2 = norm2 + row2[n] * row2[n]
            if norm1 == 0 or norm2 == 0:
                dot = 0
            else:
                dot = dot/math.sqrt(norm1 * norm2)
            if abs(dot) > 0.2: 
                temp = [word, word2]
                temp.sort()
                w1 = temp[0]; w2 = temp[1]
                results.append('%0.2f   %s - %s ' % (dot, w1, w2))
            
    results.sort()
    results.reverse()
    for r in results: print r
    
    #@+at
    # # print two concept vectors
    # print 'V1: %s' % (hv_lookup[1])
    # for cell in trans[1]:
    #     print cell
    # print 'V2: %s' % (hv_lookup[2])
    # for cell in trans[2]:
    #     print cell
    #@-at
    #@nonl
    #@-node:tom.20060721125850.32:Concept - concept similarity
    #@-node:tom.20060721125850.31:Concept similarity to Terms
    #@-others
    #@nonl
    #@-node:tom.20060721125850.28:Compute cosines between terms
    #@+node:tom.20060721125850.33:Check words
    #@+at
    # rows = {}
    # 
    # for r in range(len(recovered_ignores)):
    #     row = recovered_ignores[r]
    #     str = ''
    #     word = w_lookup[r]
    #     n = 0
    #     for i in range(len(row)):
    #         eps = 0.1
    #         val = row[i]
    #         if abs(val) > eps:
    #             n = n + 1
    #             if n > 1 and val >= 0: str += ' + '
    #             str += '%.3f * %s ' % (val, hv_lookup[i])
    #     str += '\n'
    #     row = recovered[r]
    #     n = 0
    #     for i in range(len(row)):
    #         eps = 0.1
    #         val = row[i]
    #         if abs(val) > eps:
    #             n = n + 1
    #             if n > 1 and val >= 0: str += ' + '
    #             str += '%.3f * %s ' % (val, hv_lookup[i])
    #     rows[word] = str
    # 
    # words = rows.keys()
    # words.sort()
    # for w in words:
    #     print w
    #     print rows[w]
    #     print
    #@-at
    #@nonl
    #@-node:tom.20060721125850.33:Check words
    #@-others
    
    
#@nonl
#@-node:tom.20060721125850.20:Test
#@-others
#@nonl
#@-node:tom.20060721125850.5:@thin lsa.py
#@-leo

Reply via email to