http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/climatology/clim/wls.py ---------------------------------------------------------------------- diff --git a/climatology/clim/wls.py b/climatology/clim/wls.py new file mode 100755 index 0000000..d7ed8a5 --- /dev/null +++ b/climatology/clim/wls.py @@ -0,0 +1,798 @@ +#!/usr/bin/env python +#----------------------------------------------------------------------------- +# Name: filelist.py +# Purpose: File listing class/functions. +# +# Author: Brian Wilson +# +# Created: Mon Apr 10 11:01:06 2006 +# Copyright: (c) 2006, California Institute of Technology. +# U.S. Government Sponsorship acknowledged. +#----------------------------------------------------------------------------- +# +USAGE = """ +filelist.py [--help] [--bottomUp] [--directory] [--delete] + [--fetchDir <outputDir>] [--fetchWitSubDirs] + [--list] [--matchUrl] --quiet] [--regex '.*\.[cC]'] + [--size] [--topOnly] [--url] + [--wildcard '*.txt.*'] [--xml] <topPaths ...> + +Recursively traverse and print (with full paths or URL's) all files +under the topPath(s) that match ANY of one or more regular expressions +and/or wildcard glob) strings. By default, it simply prints the matches, +but one can also get their sizes, fetch them, or delete them. + +The topPaths can be a mixture of local and remote (ftp or http) +paths, in which case a list of URL's is returned. If xml mode is +turned on, then the output is an XML list. + +If no regex or wildcard patterns are specified, then ALL files +are returned. If files are fetched, then the URL's are +REWRITTEN to point to the local copies. + +""" +# See the bottom of the file for exact switches and example of use. 
+ +import sys, os, re, string, getopt, types, getpass +import urllib, urllib2, urlparse, time, shutil, socket, stat +from fnmatch import fnmatchcase +from ftplib import FTP +#import dataenc + +def matchAnyThenConstrain(root, name, haveRegs, regs, haveWilds, wildCards, + constraintFunction): + """Return True if the file name matches any of the compiled regular + expressions or any of the wildcard (glob) specs, and (if present) the + constraintFunction returns True. The regex can be a pair of match & + substitution patterns. The 'name' of the file might be altered by a + regex substitution and/or the constraintFunction. + """ + if not haveRegs and not haveWilds: + if constraintFunction is not None: + return constraintFunction(root, name) + else: + return (True, name) + else: + match = False + if haveRegs: + for reg in regs: + pattern, subst = reg + if pattern.search(name): + match = True + if subst: + name = pattern.sub(subst, name) + break + if haveWilds and not match: + for wild in wildCards: + if fnmatchcase(name, wild): + match = True + break + if match and constraintFunction is not None: + match, name = constraintFunction(root, name) + return (match, name) + + +# Users call this function +def filelist(urlPaths, regSpecs=[], wildCards=[], needCredentials=False, userCredentials=None, + matchFunction=matchAnyThenConstrain, constraintFunction=None, + matchUrl=False, walkDirectories=True, + urlMode=True, xmlMode=True, quietMode=False, verboseMode=False, getFileInfo=False, + fetchDir=None, fetchIfNewer=False, fetchWithSubDirs=False, + directoryMode=False, listMode=False, deleteMode=False, topDown=True, + stream=sys.stdout): + """Recursively traverse and print (with full paths or URL's) all files + under the topPath(s) that match one or more regular expressions and/or + wildcard (glob) strings, and an optional constraint (T/F) function to + further winnow the candidate matches. (The matchFunction can also be + entirely replaced with custom logic.) 
+ + By default, it simply generates the matches, but one can also fetch them, + get their sizes, or delete them (if they are local files). + Handles local directory paths and ftp/http URL's. + + Returns three file lists: matched, actually fetched, & destination names. + """ + try: + matchedFiles = [] # source files that match criteria + fetchedFiles = [] # files that were actually fetched this run + destinationFiles = [] # destination (local) file names (rewritten URL) + + topPaths = [] + for url in urlPaths: + if url == '' or url == None: continue + remote, protocol, netloc, path = remoteUrl(url) + if not remote: url = os.path.abspath(url) + if url[-1] == '/': url = url[:-1] + topPaths.append(url) + + if needCredentials and userCredentials is None: + userCredentials = promptForCredentials(topPaths) + + if fetchDir: + workDir = os.path.join(fetchDir, '.tmp') + # fetch into tmp directory & then rename so fetching is atomic + try: os.mkdir(workDir) + except: pass + if not os.path.exists(workDir): + die("filelist: Cannot write to fetch directory %s" % fetchDir) + + if isinstance(topPaths, types.StringType): topPaths = [topPaths] + regSpecs = [s for s in regSpecs if s != '' and s != None] + wildCards = [s for s in wildCards if s != '' and s != None] + + haveRegs = False; regs = []; haveWilds = False; haveMatchFunction = False + if len(regSpecs) > 0: + haveRegs = True + regs = [] + for reg in regSpecs: + (pattern, subst) = parse_re_with_subst(reg) + regs.append( (re.compile(pattern), subst) ) + if len(wildCards) > 0: + haveWilds = True + + prefix = '' + extra = '' + suffix = '' + if deleteMode: + suffix += ' deleted.' + if '.' in topPaths: + die("filelist: Recursively deleting from the dot (.) path is not safe. 
Shame.") + + if directoryMode: listMode = False + if listMode: getFileInfo = True + if quietMode: stream = None + sumSizes = 0 + if xmlMode: + matchedFiles.append('<files>') + fetchedFiles.append('<files>') + _output('<files>', destinationFiles, stream) + prefix += ' <file>' + suffix += '</file>' + + for top in topPaths: + if verboseMode: warn('filelist: searching', top) + topMatchCount = 0; topFetchCount = 0 + + for root, dirs, files, infos in walk(top, userCredentials, walkDirectories, topDown): + if verboseMode: warn('filelist: found files in', root) + remote, protocol, netloc, path = remoteUrl(root) + if directoryMode: + contents = dirs + else: + contents = files + + for i in range(len(contents)): + line = '' + file = contents[i] + try: + info = infos[i] + except: + info = None + if matchUrl: + name = os.path.join(root, file) + else: + name = file + + match, newname = matchFunction(root, name, haveRegs, regs, + haveWilds, wildCards, constraintFunction) + if match: + line = '' + topMatchCount += 1 + fn = os.path.join(root, file) + + if getFileInfo or (fetchIfNewer and not remote): + if remote: + if info and getFileInfo: + if listMode: line = info.line + extra = ' ' + str(info.size) + ' ' + str(info.modTime) + sumSizes += info.size + else: + st = os.stat(fn) + line = ' '.join( map(str, \ + (st.st_mode, st.st_uid, st.st_gid, st.st_size, st.st_mtime, fn))) + info = FileInfo(line, st.st_size, st.st_mtime, st.st_uid, st.st_gid, st.st_mode) + if getFileInfo: + extra = ' ' + str(info.size) + ' ' + str(info.modTime) + sumSizes += info.size + + if not remote and urlMode: fn = makeFileUrl(fn) + matchedFiles.append(prefix + fn + extra + suffix) + + if matchUrl: + newfn = newname + else: + newfn = os.path.join(root, newname) + newr, newp, newloc, newpath = remoteUrl(newfn) + newfile = os.path.split(newpath)[1] + + if fetchDir: + if fetchDir == '.': fetchDir = os.getcwd() + if fetchWithSubDirs: + destDir = os.path.join(fetchDir, newpath[1:]) + else: + destDir = fetchDir + 
destFile = os.path.join(destDir, newfile) + tmpFile = os.path.join(workDir, newfile) + + if shouldFetch(remote, destFile, fetchIfNewer, info): + if not quietMode: + warn('filelist: Fetching ', fn) + warn('filelist: Writing ', destFile) + try: + os.makedirs(destDir) + except: + # kludge, makedirs throws exception if any part of path exists + pass + if remote: + urllib.urlretrieve(fn, tmpFile) + else: + shutil.copyfile(fn, tmpFile) + os.rename(tmpFile, destFile) # atomic rename of file into destDir + + topFetchCount += 1 + fetchedFiles.append(prefix + fn + suffix) + if getFileInfo: line = line + ' ' + destFile + + # now rewrite URL to point to local copy of file + fn = destFile + if not remote and urlMode: fn = makeFileUrl(fn) + + if not listMode: + line = prefix + fn + extra + suffix + _output(line, destinationFiles, stream) + if deleteMode: + if remote: + die('filelist: Cannot delete remote files (yet)') + else: + os.unlink(fn) + + if verboseMode and fetchDir: + warn('filelist: Matched %d files from %s' % (topMatchCount, top)) + warn('filelist: Fetched %d files from %s' % (topFetchCount, top)) + if fetchDir: + for f in os.listdir(workDir): os.remove(os.path.join(workDir, f)) + os.rmdir(workDir) + + if xmlMode: + matchedFiles.append('</files>') + fetchedFiles.append('</files>') + _output('<files>', destinationFiles, stream) + + if getFileInfo: + if xmlMode: + line = '<totalSize>%s</totalSize>' % sumSizes + else: + line = '#filelist: total size %s' % sumSizes + matchedFiles.append(line) + _output(line, destinationFiles, stream) + + except KeyboardInterrupt: + if fetchDir: + for f in os.listdir(workDir): os.remove(os.path.join(workDir, f)) + os.rmdir(workDir) + die('filelist: Keyboard Interrupt') + + return (matchedFiles, fetchedFiles, destinationFiles) + + +def shouldFetch(remote, destFile, fetchIfNewer, srcFileInfo): + if remote: + if os.path.exists(destFile): + doFetch = False + else: + doFetch = True + else: + if os.path.exists(destFile): + if fetchIfNewer: + 
destModTime = os.path.getmtime(destFile) + if destModTime < srcFileInfo.modTime: + doFetch = True + else: + doFetch = False + else: + doFetch = False + else: + doFetch = True + return doFetch + +def _output(line, lines, stream=None): + """Internal function: Add line to output lines and optionally print to stream.""" + lines.append(line) + if stream: print >>stream, line + +class FileInfo: + """Holder class for those file info. elements that are consistent among local + files (output of stat), ftp directories, http, etc. Minimum useful fields are + modification time and size. Line contains usual string output of ls -l. + """ + def __init__(self, line, size, modTime, userId=None, groupId=None, protectMode=None): + self.line=line; self.size=size; self.modTime=modTime + self.userId=userId; self.groupId=groupId; self.protectMode=protectMode + +class UserCredential(object): + """Container for user credential info. like username, password, certificate, etc. + """ + def __init__(self, username=None, password=None, validInterval=None, certificate=None): + self.username = username + self.password = password + self.validInterval = validInterval # tuple of Ints (days, hours, minutes) + if password is not None and validInterval is None: + die('UserCredential: If password is present, validInterval is also required.') + self.certificate = certificate + + def getPassword(self): + pw = self._password + if pw: + pw, daynumber, timestamp = dataenc.pass_dec(pw) + if dataenc.unexpired(daynumber, timestamp, self.validInterval): + return pw + else: + return None + else: + return None + def setPassword(self, pw): + if pw and pw != '': + self._password = dataenc.pass_enc(pw, daynumber=True, timestamp=True) + else: + self._password = pw + password = property(getPassword, setPassword) + +class UserCredentials: + """Contains dictionary of (url, credential) pairs and optionally an httpProxy. 
+ """ + def __init__(self, httpProxy=None, credentials={}): + self.httpProxy = httpProxy + self.credentials = credentials + def add(self, url, credential): + self.credentials[url] = credential; return self + def forUrl(self, url): + for key in self.credentials: + if url.startswith(key): + return self.credentials[key] + return None + +def promptForCredentials(urls, httpProxy=None): + if httpProxy == None: + httpProxy = raw_input('Enter HTTP proxy [none]: ') + if httpProxy == '': httpProxy = None + credentials = UserCredentials(httpProxy) + localUserName = getpass.getuser() + for url in urls: + remote, protocol, netloc, path = remoteUrl(url) + if remote: + username, password, validInterval = promptForCredential(url, localUserName) + credential = UserCredential(username, password, validInterval) + credentials.add(url, credential) + return credentials + +def promptForCredential(url, localUserName): + remote, protocol, netloc, path = remoteUrl(url) + if protocol == 'ftp': + defaultUserName = 'anonymous' + else: + defaultUserName = localUserName + username = raw_input('Need credentials for URL %s\nUsername [%s]: ' \ + % (url, defaultUserName)) + if username == '': username = defaultUserName + password = '' + while password == '': + password = getpass.getpass() + validInterval = [0, 1, 0] + if password != '': + response = raw_input('Enter valid time period for credential [(days, hours, minutes) = 0 1 0]: ') + if response != '': + validInterval = response.split() + return (username, password, validInterval) + +class DirectoryWalker: + """Recursively walk directories using the protocol specified in a URL. + Sublclasses handle ftp, http, sftp, local file system, etc. + """ + def __init__(self, userCredentials=None, retries=3, sleepTime=5): + self.userCredentials = userCredentials + self.retries = retries + self.sleepTime = sleepTime + + def walk(self, top, walkDirectories=True): + """Recursively walk directories on a remote site to retrieve file lists. 
+ """ + remote, protocol, netloc, path = remoteUrl(top) + status, dir_listing = self.retrieveDirList(top) + if status: + if len(dir_listing) == 0: + yield (top, [], [], []) + else: + (dirs, files, infos) = self.parseDirList(dir_listing, path) + yield (top, dirs, files, infos) + + if walkDirectories: + for dir in dirs: + # Depth-first recursion + for root, dirs, files, infos in self.walk(top + '/' + dir, walkDirectories): + yield (root, dirs, files, infos) + else: + warn('DirectoryWalker: error, unable to retrieve directory listing at', top) + yield (top, [], [], []) + + def retrieveDirList(self, url): + """Retrieve directory listing as a list of text lines. Returns (status, dirList).""" + pass + def parseDirList(self, dirList, path=None): + """Parse directory listing (text) and return three lists (dirs, files, fileInfos).""" + pass + +class FtpDirectoryWalker(DirectoryWalker): + """Recursively walk directories on an ftp site.""" + def __init__(self, userCredentials=None, retries=3, sleepTime=5): + DirectoryWalker.__init__(self, userCredentials, retries, sleepTime) + + def retrieveDirList(self, url): + """Retrieve a directory listing via ftp with retries. + """ + remote, protocol, netloc, path = remoteUrl(url) + credential = None + if self.userCredentials: + credential = self.userCredentials.forUrl(url) + dir = ''; dir_list = [] + ftp = FTP() + for i in range(self.retries): + try: + ftp.connect(netloc) + if credential is None or \ + credential.username == 'anonymous' or \ + credential.username == '': + ftp.login() + else: + ftp.login(credential.username, credential.password) + ftp.cwd(path) + ftp.retrlines('LIST', dir_list.append) + ftp.quit() + dir = '\n'.join(dir_list) + return (True, dir) + except: + pass + time.sleep(self.sleepTime) + warn('FtpDirectoryWalker: connect retry to ', netloc, path) + return (False, dir) + + def parseDirList(self, dir, path=None): + """Parse long directory listing returned by ftp or (ls -l). 
+ Separate entries into directories and files. + """ + dirs = []; files = []; infos = [] + for entry in dir.split('\n'): + fields = entry.split() + if len(fields) < 7: continue + fn = fields[-1] + if fn == '.' or fn == '..': continue + if re.match('^d', fields[0])and fields[0][7] == 'r': + dirs.append(fn) + else: + files.append(fn) + info = FileInfo(entry, int(fields[4]), '-'.join(fields[5:8]), \ + fields[2], fields[3], fields[0]) + infos.append(info) + return (dirs, files, infos) + +class DirListingParser(object): + """Base class for directory listing parsers.""" + def __init__(self, regex): + self.regex = regex + self.compiledRegex = re.compile(self.regex) + + def parse(self, dir, listingHtml): + """Return (dirs, files, infos).""" + dirs = []; files = []; infos = [] + raise NotImplementedError, "Override this method in sub class." + +class ApacheDirListingParser(DirListingParser): + """Parser class for apache.""" + def parse(self, dir, listingHtml): + dirs = []; files = []; infos = [] + items = self.compiledRegex.findall(listingHtml) + for item, itemName in items: + if itemName.strip() == 'Parent Directory': continue + if isinstance(item, str): + name = item + else: + name, dateTime, size = item[:] + + if name.endswith('/'): + type = 'd' + dirs.append(name[:-1]) + else: + type = '-' + files.append(name) + #not doing file info + ''' + size = size.lower() + if size.endswith('k'): + size = int(size[:-1]) * 1024 + elif size.endswith('m'): + size = int(size[:-1]) * 1024 * 1024 + else: + size = -1 + line = '%s--------- 1 ? ? 
%15d %s %s' % (type, size, dateTime, name) + info = FileInfo(line, size, dateTime) + ''' + infos.append(None) + return (dirs, files, infos) + +class CDAACDirListingParser(DirListingParser): + """Parser class for CDAAC data server.""" + def parse(self, dir, listingHtml): + dirs = []; files = []; infos = [] + items = self.compiledRegex.findall(listingHtml) + for item, itemName in items: + if itemName.strip() == 'Parent Directory': continue + if isinstance(item, str): + name = item + else: + name, dateTime, size = item[:] + if name.endswith('/'): + type = 'd' + dirs.append(name) + else: + type = '-' + files.append(name) + #not doing file info + ''' + size = size.lower() + if size.endswith('k'): + size = int(size[:-1]) * 1024 + elif size.endswith('m'): + size = int(size[:-1]) * 1024 * 1024 + else: + size = -1 + line = '%s--------- 1 ? ? %15d %s %s' % (type, size, dateTime, name) + info = FileInfo(line, size, dateTime) + ''' + infos.append(None) + return (dirs, files, infos) + +class HttpDirectoryWalker(DirectoryWalker): + """Recursively walk directories on an http (web) site to retrieve file lists. + Handles many styles of HTML directory listings, but still very FRAGILE. 
+ """ + + #list of directory listing parser plugins + DIR_LIST_REGEX_PLUGINS = [ + #apache 2.0.55 directory listing + ApacheDirListingParser(r'(?i)alt="\[.*?\]">\s*<A HREF="(?P<name>.*?)">(.*?)</A>'), + #CDAAC (COSMIC Data) + CDAACDirListingParser(r'(?i)<LI><A HREF="(?P<name>.*?)">(.*?)</A>'), + ] + + def __init__(self, userCredentials=None, retries=3, sleepTime=5): + DirectoryWalker.__init__(self, userCredentials, retries, sleepTime) + if self.userCredentials: + if self.userCredentials.httpProxy: + os.environ['http_proxy'] = self.userCredentials.httpProxy + # global kludge, default proxyHandler looks up proxy there + passwordMgr = urllib2.HTTPPasswordMgrWithDefaultRealm() + for url, cred in self.userCredentials.credentials.iteritems(): + passwordMgr.add_password(None, url, cred.username, cred.password) + authHandler = urllib2.HTTPBasicAuthHandler(passwordMgr) + opener = urllib2.build_opener(authHandler) + else: +# opener = urllib2.build_opener() + opener = None +# opener.add_headers = [('User-agent', 'Mozilla/5.0')] + self.opener = opener + + def retrieveDirList(self, url): + """Retrieve an HTML directory listing via http with retries. 
+ """ +### url = os.path.join(url, 'contents.html') ### hack for DAP servers at GES-DISC + dir_listing = '' + proxies = {} + for i in range(self.retries): + try: + if self.opener: + response = self.opener.open(url) + else: + response = urllib.urlopen(url) + except IOError, e: + if hasattr(e, 'reason'): + warn('HttpDirectoryWalker: Error, failed to reach server because: %s' % e.reason) + elif hasattr(e, 'code'): + warn('HttpDirectoryWalker: Server could not fulfill request, error code %s' % e.code) + else: + dir_listing = response.read() + return (True, dir_listing) + time.sleep(self.sleepTime) + warn('HttpDirectoryWalker: retrying ', url) + return (False, dir_listing) + + reDirPath = re.compile(r'(?i)<H1>.*?Index of\s*?(\S+?)\s*?</H1>') + + def parseDirList(self, dir, path): + """Parse fragile HTML directory listings returned by various HTTP servers, + including Apache and OpenDAP. Separate entries into directories and files. + """ + dirs = []; files = []; infos = [] + if path: + match = HttpDirectoryWalker.reDirPath.search(dir) + if not match: + die('HttpDirectoryWalker: Cannot find directory name %s in HTML listing:\n%s' % (path, dir)) + dirName = match.group(1) + if dirName not in path: + warn('HttpDirectoryWalker: Directory name %s in HTML listing does not agree with path %s:\n%s' % (dirName, path, dir)) + + # Try to find directory lines that contain file info + reDirListWithStat = re.compile( \ + r'(?i)<A HREF=[\'"]*?(?P<name>[^\?].*?' + dirName + r'.*?)[\'"]*?>.*?</A>\s*(?P<dateTime>\S+ \S+)\s+?(?P<size>\S+)\s*?$') + items = reDirListWithStat.findall(dir) + # If not, then try to find simple directory lines + if len(items) == 0: + reDirList = re.compile( \ + r'(?i)<A HREF=[\'"]*?(?P<name>[^\?].*?' + dirName + r'.*?)[\'"]*?>.*?</A>') + items = reDirList.findall(dir) + + if len(items) != 0: + dateTime = '? 
?'; size = '' + for item in items: + if isinstance(item, str): + name = item + else: + name, dateTime, size = item[:] + if dirName not in name: continue + + if name.endswith('/'): + type = 'd' + dirs.append(name) + else: + type = '-' + files.append(name) + size = size.lower() + if size.endswith('k'): + size = int(size[:-1]) * 1024 + elif size.endswith('m'): + size = int(size[:-1]) * 1024 * 1024 + else: + size = -1 + line = '%s--------- 1 ? ? %15d %s %s' % (type, size, dateTime, name) + info = FileInfo(line, size, dateTime) + infos.append(info) + print line + + #try plugins + else: + for plugin in self.DIR_LIST_REGEX_PLUGINS: + pluginResults = plugin.parse(dirName, dir) + if len(pluginResults[0]) != 0 or len(pluginResults[1]) != 0 or \ + len(pluginResults[2]) != 0: return pluginResults + + return (dirs, files, infos) + + +def walk(top, userCredentials=None, walkDirectories=True, topDown=True): + """Recursively walk directories to retrieve file lists. + Returns the topPath, contained subdirectories and files, and + optionally FileInfo objects (if info is included in protocol results). + Handles local directory paths and ftp/http protocols (URL's). 
+ """ + remote, protocol, netloc, path = remoteUrl(top) + if remote: + if protocol == 'ftp': + ftpWalker = FtpDirectoryWalker(userCredentials) + for root, dirs, files, infos in ftpWalker.walk(top, walkDirectories): + yield (root, dirs, files, infos) + elif protocol == 'http': +# import pdb; pdb.set_trace() + httpWalker = HttpDirectoryWalker(userCredentials) + for root, dirs, files,infos in httpWalker.walk(top, walkDirectories): + yield (root, dirs, files, infos) + elif protocol == 'sftp': + sftpWalker = SftpDirectoryWalker(userCredentials) + for root, dirs, files,infos in sftpWalker.walk(top, walkDirectories): + yield (root, dirs, files, infos) + else: + die('filelist: Cannot handle protocol ', protocol) + else: + if walkDirectories: + for root, dirs, files in os.walk(top, topDown): + yield (root, dirs, files, []) + else: + files = os.listdir(top) + yield (top, [], files, []) + +def remoteUrl(url): + """Returns True if the URL is remote; also returns protocol, + net location (host:port), and path.""" + protocol, netloc, path, params, query, fragment = urlparse.urlparse(url) + if protocol == '': + return (False, protocol, netloc, path) + else: + return (True, protocol, netloc, path) + + +# utils +RE_WITH_SUBST_PATTERN = re.compile(r'^s/(.+)/(.+)/$') +def parse_re_with_subst(str): + match = RE_WITH_SUBST_PATTERN.match(str) + if match: + return (match.group(1), match.group(2)) + else: + return (str, None) + +def hostName(): + return socket.gethostbyaddr(socket.gethostname())[0] + +FILE_URL_PREFIX = 'file://' + hostName() +def makeFileUrl(file): + return FILE_URL_PREFIX + file + +def warn(*str): sys.stderr.write(' '.join(str) + '\n') +def die(str, status=1): warn(str); sys.exit(status) + +def main(): + """Main function for outside scripts to call.""" + + from sys import argv + + if len(argv) < 2: die(USAGE) + try: + opts, argv = getopt.getopt(argv[1:], 'hbcdf:ilqr:stuvw:x', + ['help', 'bottomUp', 'credentials', 'delete', 'directory', + 'fetchDir=', 'fetchIfNewer', 
'fetchWithSubDirs', 'info', + 'list', 'quiet', 'regex=', 'size', 'topOnly', + 'url', 'verbose', 'wildcard=', 'xml']) + except getopt.GetoptError, (msg, bad_opt): + die("%s error: Bad option: %s, %s" % (argv[0], bad_opt, msg)) + + regSpecs = []; wildCards = []; matchUrl=False; walkDirectories = True + needCredentials = False; userCredentials = None + urlMode=False; xmlMode=False; quietMode=False; verboseMode=False; getFileInfo=False + fetchDir = None; fetchIfNewer=False; fetchWithSubDirs=False + directoryMode = False; deleteMode = False; topDown = True; listMode = False + + for opt, val in opts: + if opt in ('-h', '--help'): die(USAGE) + elif opt in ('-b', '--bottomUp'): topDown = False + elif opt in ('-c', '--credentials'): needCredentials = True + elif opt in ('-d', '--directory'): directoryMode=True + elif opt in ('--delete'): deleteMode=True + elif opt in ('-f', '--fetchDir'): fetchDir = val + # retrieve remote files to this dir + elif opt in ('--fetchIfNewer'): fetchIfNewer=True + # only fetch if src file is newer than existing dest file + elif opt in ('--fetchWithSubDirs'): fetchWithSubDirs=True + # mirror subdirectories when fetching + elif opt in ('-i', '--info'): getFileInfo=True + elif opt in ('-l', '--list'): listMode=True + elif opt in ('-m', '--matchUrl'): matchUrl=True + # regexs match entire URL/path, not just file name + elif opt in ('-q', '--quiet'): quietMode=True + # don't print files during walk + elif opt in ('-r', '--regex'): regSpecs.append(val) + elif opt in ('-s', '--size'): sizeMode=True + elif opt in ('-t', '--topOnly'): walkDirectories=False + elif opt in ('-u', '--url'): urlMode=True + # return URL's (file:, ftp:, http:, etc.) 
+ elif opt in ('-v', '--verbose'): verboseMode=True + elif opt in ('-w', '--wildcard'): wildCards.append(val) + elif opt in ('-x', '--xml'): xmlMode=True # return list in XML format + else: die(USAGE) + +# import pdb; pdb.set_trace() + + matchedFiles, fetchedFiles, destinationFiles = \ + filelist(argv, regSpecs, wildCards, needCredentials, userCredentials, + matchAnyThenConstrain, None, matchUrl, walkDirectories, + urlMode, xmlMode, quietMode, verboseMode, getFileInfo, + fetchDir, fetchIfNewer, fetchWithSubDirs, + directoryMode, listMode, deleteMode, topDown) + + if quietMode: + if listMode == 'match': + print matchedFiles + elif listMode == 'fetch': + print fetchedFiles + elif listMode == 'destination': + print destinationFiles + else: + pass + + +if __name__ == '__main__': main()
http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/climatology/setup.py ---------------------------------------------------------------------- diff --git a/climatology/setup.py b/climatology/setup.py new file mode 100644 index 0000000..de2e030 --- /dev/null +++ b/climatology/setup.py @@ -0,0 +1,9 @@ + +from setuptools import setup + +setup(name='Climatology', + version='0.1dev0', + packages=['clim']) + + + http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/.gitignore ---------------------------------------------------------------------- diff --git a/data-access/.gitignore b/data-access/.gitignore new file mode 100644 index 0000000..7be00a9 --- /dev/null +++ b/data-access/.gitignore @@ -0,0 +1,4 @@ +*.c +build +dist +*.egg-info \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/README.md ---------------------------------------------------------------------- diff --git a/data-access/README.md b/data-access/README.md new file mode 100644 index 0000000..f2ebe7a --- /dev/null +++ b/data-access/README.md @@ -0,0 +1,58 @@ +data-access +===== + +Python module that provides API access to the NEXUS datastores. + +# Developer Setup + +## Database Setup + +1. Download and unzip [Apache Solr 5.3.1](http://archive.apache.org/dist/lucene/solr/5.3.1/) + + 1. Copy the [nexustiles](config/schemas/solr/nexustiles) directory into `$SOLR_INSTALL_DIR/server/solr` + 2. Copy the [dataset](config/schemas/solr/dataset) directory into `$SOLR_INSTALL_DIR/server/solr` + 3. Download [JTS Topology Suite v1.13](https://sourceforge.net/projects/jts-topo-suite/files/jts/1.13/) and extract the zip. + 4. From the exploded JTS zip, copy `$JTS_ZIP/lib/jts-1.13.jar` and `$JTS_ZIP/lib/jtsio-1.13.jar` into `$SOLR_INSTALL_DIR/server/lib/ext` + 5. Start Solr using `$SOLR_INSTALL_DIR/bin/solr start` then open up the admin page (http://localhost:8983) to make sure there are no errors + +2. 
Download and unzip [Apache Cassandra 2.2.x](http://cassandra.apache.org/download/) + + 1. Start cassandra `$CASSANDRA_INSTALL_DIR/bin/cassandra` + 2. Open a cqlsh session `$CASSANDRA_INSTALL_DIR/bin/cqlsh` + 3. Execute the DDL located in [nexustiles.cql](config/schemas/cassandra/nexustiles.cql) + +## Code Installation + +**NOTE** This project has a dependency on [nexus-messages](https://github.jpl.nasa.gov/thuang/nexus/tree/master/nexus-ingest/nexus-messages). Make sure nexus-messages is installed in the same environment you will be using for this module. + +1. Set up a separate conda env or activate an existing one + + ```` + conda create --name nexus-data-access python + source activate nexus-data-access + ```` + +2. Install conda dependencies + + ```` + conda install numpy + ```` + +3. Install cython `pip install cython` + +4. Run `python setup.py install` + +5. Run `python test/nexustilestest.py` to validate that the installation worked + 1. If you get an error like the following + + ```` + /Users/user/.pyxbld/temp.macosx-10.5-x86_64-2.7/pyrex/cassandra/numpy_parser.c:266:10: fatal error: 'numpyFlags.h' file not found + ... 
+ ImportError: Building module nexustiles.dao.CassandraProxy failed: ['ImportError: Building module cassandra.numpy_parser failed: ["CompileError: command \'gcc\' failed with exit status 1\\n"]\n'] + ```` + + It can be fixed by copying the `numpyFlags.h` file from the `cassandra` library to the python include library + + ```` + cp /path/to/anaconda/env/lib/python2.7/site-packages/cassandra/numpyFlags.h /path/to/anaconda/env/include/python2.7/numpyFlags.h + ```` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/cassandra/nexustiles.cql ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/cassandra/nexustiles.cql b/data-access/config/schemas/cassandra/nexustiles.cql new file mode 100644 index 0000000..f0b8e36 --- /dev/null +++ b/data-access/config/schemas/cassandra/nexustiles.cql @@ -0,0 +1,8 @@ +CREATE KEYSPACE IF NOT EXISTS nexustiles WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }; + +DROP TABLE IF EXISTS nexustiles.sea_surface_temp; + +CREATE TABLE nexustiles.sea_surface_temp ( + tile_id uuid PRIMARY KEY, + tile_blob blob +); http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_ca.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_ca.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_ca.txt new file mode 100644 index 0000000..307a85f --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_ca.txt @@ -0,0 +1,8 @@ +# Set of Catalan contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +l +m +n +s +t 
http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_fr.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_fr.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_fr.txt new file mode 100644 index 0000000..f1bba51 --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_fr.txt @@ -0,0 +1,15 @@ +# Set of French contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +l +m +t +qu +n +s +j +d +c +jusqu +quoiqu +lorsqu +puisqu http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_ga.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_ga.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_ga.txt new file mode 100644 index 0000000..9ebe7fa --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +m +b http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_it.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_it.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_it.txt new file mode 100644 index 0000000..cac0409 --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/contractions_it.txt @@ -0,0 +1,23 @@ +# Set of 
Italian contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/hyphenations_ga.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/hyphenations_ga.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/hyphenations_ga.txt new file mode 100644 index 0000000..4d2642c --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/hyphenations_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish hyphenations for StopFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +h +n +t http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stemdict_nl.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stemdict_nl.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stemdict_nl.txt new file mode 100644 index 0000000..4410729 --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stemdict_nl.txt @@ -0,0 +1,6 @@ +# Set of overrides for the dutch stemmer +# TODO: load this as a resource from the analyzer and sync it in build.xml +fiets fiets +bromfiets bromfiets +ei eier +kind kinder http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stoptags_ja.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stoptags_ja.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stoptags_ja.txt 
new file mode 100644 index 0000000..71b7508 --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. +# +##### +# noun: unclassified nouns +#åè© +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#åè©-ä¸è¬ +# +# noun-proper: Proper nouns where the sub-classification is undefined +#åè©-åºæåè© +# +# noun-proper-misc: miscellaneous proper nouns +#åè©-åºæåè©-ä¸è¬ +# +# noun-proper-person: Personal names where the sub-classification is undefined +#åè©-åºæåè©-人å +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. ãå¸ã®æ¹ +#åè©-åºæåè©-人å-ä¸è¬ +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. å±±ç° +#åè©-åºæåè©-人å-å§ +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太é +#åè©-åºæåè©-人å-å +# +# noun-proper-organization: Names representing organizations. +# e.g. éç£ç, NHK +#åè©-åºæåè©-çµç¹ +# +# noun-proper-place: Place names where the sub-classification is undefined +#åè©-åºæåè©-å°å +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. ã¢ã¸ã¢, ãã«ã»ãã, äº¬é½ +#åè©-åºæåè©-å°å-ä¸è¬ +# +# noun-proper-place-country: Country names. +# e.g. æ¥æ¬, ãªã¼ã¹ãã©ãªã¢ +#åè©-åºæåè©-å°å-å½ +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#åè©-代åè© +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. 
ãã, ãã, ããã¤, ããªã, ãã¡ãã¡, ããã¤, ã©ãã, ãªã«, ã¿ãªãã, ã¿ããª, ãããã, ãããã +#åè©-代åè©-ä¸è¬ +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ããã, ããã, ãããã, ããã, ãããã +#åè©-代åè©-ç¸®ç´ +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. éæ, 䏿, åå¾, å°é +#åè©-å¯è©å¯è½ +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (ãã, ã§ãã, ãªãã, ãã ãã) +# e.g. ã¤ã³ããã, æç, æªå, æªæ¦è¦é, ä¸å®å¿, ä¸åã +#åè©-ãµå¤æ¥ç¶ +# +# noun-adjective-base: The base form of adjectives, words that appear before 㪠("na") +# e.g. å¥åº·, 宿, é§ç®, ã ã +#åè©-形容åè©èªå¹¹ +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like ä½ (å), æ°. +# e.g. 0, 1, 2, ä½, æ°, å¹¾ +#åè©-æ° +# +# noun-affix: noun affixes where the sub-classification is undefined +#åè©-éèªç« +# +# noun-affix-misc: Of adnominalizers, the case-marker ã® ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. ããã¤ã, æ, ãã, ç²æ, æ°, ããã, å«ã, ãã, ç, ãã¨, äº, ãã¨, æ¯, ãã ã, 次第, +# é , ãã, æçº, ã¤ãã§, åºã§, ã¤ãã, ç©ãã, ç¹, ã©ãã, ã®, ã¯ã, ç, ã¯ãã¿, å¼¾ã¿, +# æå, ãµã, ãµã, æ¯ã, ã»ã, æ¹, æ¨, ãã®, ç©, è , ãã, æ , ããã, æä»¥, ãã, 訳, +# ãã, å²ã, å², ã-å£èª/, ãã-å£èª/ +#åè©-éèªç«-ä¸è¬ +# +# noun-affix-adverbial: noun affixes that that can behave as adverbs. +# e.g. 
ããã , é, ããã, æãå¥, ãã¨, å¾, ä½ã, 以å¤, 以é, 以å¾, 以ä¸, 以å, 䏿¹, ãã, +# ä¸, ãã¡, å , ãã, æã, ããã, éã, ãã, ã£ãã, çµæ, ãã, é , ãã, é, æä¸, ããªã, +# æä¸, ããã, èªä½, ãã³, 度, ãã, çº, ã¤ã©, é½åº¦, ã¨ãã, éã, ã¨ã, æ, ã¨ãã, æ, +# ã¨ãã, é端, ãªã, ä¸, ã®ã¡, å¾, ã°ãã, å ´å, æ¥, ã¶ã, å, ã»ã, ä», ã¾ã, å, ã¾ã¾, +# å, ä¾, ã¿ãã, ç¢å +#åè©-éèªç«-å¯è©å¯è½ +# +# noun-affix-aux: noun affixes treated as å©åè© ("auxiliary verb") in school grammars +# with the stem ãã(ã ) ("you(da)"). +# e.g. ãã, ãã, æ§ (ãã) +#åè©-éèªç«-å©åè©èªå¹¹ +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form 㪠(aux "da"). +# e.g. ã¿ãã, ãµã +#åè©-éèªç«-形容åè©èªå¹¹ +# +# noun-special: special nouns where the sub-classification is undefined. +#åè©-ç¹æ® +# +# noun-special-aux: The ããã ("souda") stem form that is used for reporting news, is +# treated as å©åè© ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. ãã +#åè©-ç¹æ®-å©åè©èªå¹¹ +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#åè©-æ¥å°¾ +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ã¬ã« or ã¿ã¤ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# æ¥å°¾èª ("suffix") and is usually the last element in a compound noun. +# e.g. ãã, ãã, æ¹, ç²æ (ãã), ããã, ãã¿, æ°å³, ããã¿, (ï½ãã) ã, 次第, æ¸ (ã) ã¿, +# ãã, (ã§ã)ã£ã, æ, 観, æ§, å¦, é¡, é¢, ç¨ +#åè©-æ¥å°¾-ä¸è¬ +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. å, æ§, è +#åè©-æ¥å°¾-人å +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. çº, å¸, ç +#åè©-æ¥å°¾-å°å +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before ã¹ã« ("suru"). +# e.g. 
å, è¦, åã, å ¥ã, è½ã¡, è²·ã +#åè©-æ¥å°¾-ãµå¤æ¥ç¶ +# +# noun-suffix-aux: The stem form of ããã (æ§æ ) that is used to indicate conditions, +# is treated as å©åè© ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. ãã +#åè©-æ¥å°¾-å©åè©èªå¹¹ +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula ã ("da"). +# e.g. ç, ã, ãã¡ +#åè©-æ¥å°¾-形容åè©èªå¹¹ +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. å¾ (ã), 以å¾, 以é, 以å, åå¾, ä¸, æ«, ä¸, æ (ã) +#åè©-æ¥å°¾-å¯è©å¯è½ +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 婿°è© ("classifier") and includes common nouns that attach +# to numbers. +# e.g. å, ã¤, æ¬, å, ãã¼ã»ã³ã, cm, kg, ã«æ, ãå½, åºç», æé, æå +#åè©-æ¥å°¾-婿°è© +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽ã) ã, (èã) æ¹ +#åè©-æ¥å°¾-ç¹æ® +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (æ¥æ¬) 対 (ã¢ã¡ãªã«), 対 (ã¢ã¡ãªã«), (3) 対 (5), (女åª) å ¼ (主婦) +#åè©-æ¥ç¶è©ç +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle 㦠("te") and are +# semantically verb-like. +# e.g. ããã, ã覧, 御覧, é æ´ +#åè©-åè©éèªç«ç +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for åè© å¼ç¨æåå ("noun quotation") +# is ããã ("iwaku"). +#åè©-å¼ç¨æåå +# +# noun-nai_adjective: Words that appear before the auxiliary verb ãªã ("nai") and +# behave like an adjective. +# e.g. ç³ã訳, 仿¹, ã¨ãã§ã, éã +#åè©-ãã¤å½¢å®¹è©èªå¹¹ +# +##### +# prefix: unclassified prefixes +#æ¥é è© +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. 
ã (æ°´), æ (æ°), å (社), æ (ï½æ°), é« (å質), ã (è¦äº), ã (ç«æ´¾) +#æ¥é è©-åè©æ¥ç¶ +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by ãªã/ãªãã/ãã ãã. +# e.g. ã (èªã¿ãªãã), ã (座ã) +#æ¥é è©-åè©æ¥ç¶ +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. ã (å¯ãã§ããã), ãã« (ã§ãã) +#æ¥é è©-å½¢å®¹è©æ¥ç¶ +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. ç´, ããã, æ¯æ +#æ¥é è©-æ°æ¥ç¶ +# +##### +# verb: unclassified verbs +#åè© +# +# verb-main: +#åè©-èªç« +# +# verb-auxiliary: +#åè©-éèªç« +# +# verb-suffix: +#åè©-æ¥å°¾ +# +##### +# adjective: unclassified adjectives +#å½¢å®¹è© +# +# adjective-main: +#形容è©-èªç« +# +# adjective-auxiliary: +#形容è©-éèªç« +# +# adjective-suffix: +#形容è©-æ¥å°¾ +# +##### +# adverb: unclassified adverbs +#å¯è© +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. ãããããã, å¤å +#å¯è©-ä¸è¬ +# +# adverb-particle_conjunction: Adverbs that can be followed by ã®, ã¯, ã«, +# ãª, ãã, ã , etc. +# e.g. ãããªã«, ãããªã«, ãããªã«, ãªã«ã, ãªãã§ã +#å¯è©-å©è©é¡æ¥ç¶ +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. ãã®, ãã®, ãã®, ã©ã®, ãããã, ãªãããã®, ä½ããã®, ããããª, ãããã, ãããã, ãããã, +# ã©ããã, ãããª, ãããª, ãããª, ã©ããª, 大ããª, å°ããª, ããããª, ã»ãã®, ãããã, +# ã(, ã) ãã (ãã¨ãªãã)ã, å¾®ã ãã, å ã ãã, åãªã, ãããªã, æãããåã, 亡ã +#é£ä½è© +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. ã, ããã©ã, ããã¦, ããã, ããã©ããã +æ¥ç¶è© +# +##### +# particle: unclassified particles. +å©è© +# +# particle-case: case particles where the subclassification is undefined. +å©è©-æ ¼å©è© +# +# particle-case-misc: Case particles. +# e.g. ãã, ã, ã§, ã¨, ã«, ã¸, ãã, ã, ã®, ã«ã¦ +å©è©-æ ¼å©è©-ä¸è¬ +# +# particle-case-quote: the "to" that appears after nouns, a personâs speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. 
( ã ) 㨠(è¿°ã¹ã.), ( ã§ãã) 㨠(ãã¦å·è¡ç¶äº...) +å©è©-æ ¼å©è©-å¼ç¨ +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. ã¨ãã, ã¨ãã£ã, ã¨ããã, ã¨ãã¦, ã¨ã¨ãã«, ã¨å ±ã«, ã§ãã£ã¦, ã«ããã£ã¦, ã«å½ãã£ã¦, ã«å½ã£ã¦, +# ã«ããã, ã«å½ãã, ã«å½ã, ã«å½ãã, ã«ããã, ã«ããã¦, ã«æ¼ãã¦,ã«æ¼ã¦, ã«ããã, ã«æ¼ãã, +# ã«ãã, ã«ããã¦, ã«ããã, ã«é¢ã, ã«ãããã¦, ã«é¢ãã¦, ã«ãããã, ã«é¢ãã, ã«éã, +# ã«éãã¦, ã«ãããã, ã«å¾ã, ã«å¾ã, ã«ãããã£ã¦, ã«å¾ã£ã¦, ã«ããã, ã«å¯¾ã, ã«ãããã¦, +# ã«å¯¾ãã¦, ã«ãããã, ã«å¯¾ãã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¨ã£ã¦, +# ã«ã¨ã, ã«ã¾ã¤ãã, ã«ãã£ã¦, ã«ä¾ã£ã¦, ã«å ã£ã¦, ã«ãã, ã«ä¾ã, ã«å ã, ã«ãã, ã«ä¾ã, ã«å ã, +# ã«ããã£ã¦, ã«ããã, ããã£ã¦, ã以ã£ã¦, ãéã, ãéãã¦, ãéãã¦, ãããã£ã¦, ãããã, ãããã, +# ã£ã¦-å£èª/, ã¡ã ã-é¢è¥¿å¼ãã¨ããã/, (ä½) ã¦ãã (人)-å£èª/, ã£ã¦ãã-å£èª/, ã¨ããµ, ã¨ãããµ +å©è©-æ ¼å©è©-é£èª +# +# particle-conjunctive: +# e.g. ãã, ããã«ã¯, ã, ããã©, ããã©ã, ãã©, ã, ã¤ã¤, ã¦, ã§, ã¨, ã¨ããã, ã©ããã, ã¨ã, ã©ã, +# ãªãã, ãªã, ã®ã§, ã®ã«, ã°, ãã®ã®, ã ( ãã), ãããªã, (ããã) ãã(ãããªã)-å£èª/, +# (è¡ã£) ã¡ã(ãããªã)-å£èª/, (è¨ã£) ãã£ã¦ (ãããããªã)-å£èª/, (ããããªã)ã£ãã£ã¦ (å¹³æ°)-å£èª/ +å©è©-æ¥ç¶å©è© +# +# particle-dependency: +# e.g. ãã, ãã, ãã, ãã, ã¯, ã, ã +å©è©-ä¿å©è© +# +# particle-adverbial: +# e.g. ãã¦ã, ãã, ããã, ä½, ããã, ãã, (妿 ¡) ãã(ãããæµè¡ã£ã¦ãã)-å£èª/, +# (ãã)ããã (ãããªã)-å£èª/, ãã¤, (ç§) ãªã, ãªã©, (ç§) ãªã (ã«), (å ç) ãªãã (大å«ã)-å£èª/, +# (ç§) ãªãã, (å ç) ãªã㦠(大å«ã)-å£èª/, ã®ã¿, ã ã, (ç§) ã ã£ã¦-å£èª/, ã ã«, +# (å½¼)ã£ãã-å£èª/, (ãè¶) ã§ã (ããã), ç (ã¨ã), (ä»å¾) ã¨ã, ã°ãã, ã°ã£ã-å£èª/, ã°ã£ãã-å£èª/, +# ã»ã©, ç¨, ã¾ã§, è¿, (誰) ã (ã)([å©è©-æ ¼å©è©] ããã³ [å©è©-ä¿å©è©] ã®åã«ä½ç½®ããããã) +å©è©-å¯å©è© +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (æ¾å³¶) ã +å©è©-éæå©è© +# +# particle-coordinate: +# e.g. ã¨, ãã, ã ã®, ã ã, ã¨ã, ãªã, ã, ãã +å©è©-並ç«å©è© +# +# particle-final: +# e.g. 
ãã, ããã, ã, ã, (ã )ã£ã-å£èª/, (ã¨ã¾ã£ã¦ã) ã§-æ¹è¨/, ãª, ã, ãªã-å£èª/, ã, ã, ã, +# ãã-å£èª/, ãã-å£èª/, ãã-æ¹è¨/, ã®, ã®ã-å£èª/, ã, ã, ã¨, ãã-å£èª/, ã, ãã-å£èª/ +å©è©-çµå©è© +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) ãA ã B ãã. Ex:ã(å½å ã§éç¨ãã) ã,(æµ·å¤ã§éç¨ãã) ã (.)ã +# (b) Inside an adverb phrase. Ex:ã(幸ãã¨ãã) ã (, æ»è ã¯ããªãã£ã.)ã +# ã(ç¥ããå±ãããã) ã (, 試é¨ã«åæ ¼ãã.)ã +# (c) ããã®ããã«ã. Ex:ã(ä½ããªãã£ã) ã (ã®ããã«æ¯ãèã£ã.)ã +# e.g. ã +å©è©-å¯å©è©ï¼ä¸¦ç«å©è©ï¼çµå©è© +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. +å©è©-é£ä½å +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. ã«, 㨠+å©è©-å¯è©å +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. ããª, ãã, ( ããã ãã) ã«, (ããã) ã«ã(ãããã), (俺) ã (å®¶) +å©è©-ç¹æ® +# +##### +# auxiliary-verb: +å©åè© +# +##### +# interjection: Greetings and other exclamations. +# e.g. ãã¯ãã, ãã¯ãããããã¾ã, ããã«ã¡ã¯, ããã°ãã¯, ãããã¨ã, ã©ãããããã¨ã, ãããã¨ããããã¾ã, +# ããã ãã¾ã, ãã¡ãããã¾, ãããªã, ããããªã, ã¯ã, ããã, ããã, ããããªãã +#æåè© +# +##### +# symbol: unclassified Symbols. +è¨å· +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [ââ@$ãâ+] +è¨å·-ä¸è¬ +# +# symbol-comma: Commas +# e.g. [,ã] +è¨å·-èªç¹ +# +# symbol-period: Periods and full stops. +# e.g. [.ï¼ã] +è¨å·-å¥ç¹ +# +# symbol-space: Full-width whitespace. +è¨å·-ç©ºç½ +# +# symbol-open_bracket: +# e.g. [({ââãã] +è¨å·-æ¬å¼§é +# +# symbol-close_bracket: +# e.g. [)}ââããã] +è¨å·-æ¬å¼§é +# +# symbol-alphabetic: +#è¨å·-ã¢ã«ãã¡ããã +# +##### +# other: unclassified other +#ãã®ä» +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. 
(ã )ã¡ +ãã®ä»-éæ +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. ãã®, ããã¨, ã㨠+ãã£ã©ã¼ +# +##### +# non-verbal: non-verbal sound. +éè¨èªé³ +# +##### +# fragment: +#èªæç +# +##### +# unknown: unknown part of speech. +#æªç¥èª +# +##### End of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_ar.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_ar.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_ar.txt new file mode 100644 index 0000000..046829d --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_ar.txt @@ -0,0 +1,125 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Cleaned on October 11, 2009 (not normalized, so use before normalization) +# This means that when modifying this list, you might need to add some +# redundant entries, for example containing forms with both Ø£ and ا +Ù Ù +ÙÙ Ù +Ù ÙÙØ§ +Ù ÙÙ +ÙÙ +ÙÙÙ +ÙÙÙØ§ +ÙÙÙ +Ù +Ù +ث٠+ا٠+Ø£Ù +ب +Ø¨ÙØ§ +ب٠+ا +Ø£ +ا٠+ا٠+Ø£Ù +Ø£Ù +ÙØ§ +ÙÙØ§ +Ø§ÙØ§ +Ø£ÙØ§ +Ø¥ÙØ§ +ÙÙÙ +٠ا +Ù٠ا +Ù٠ا +Ù٠ا +ع٠+٠ع +اذا +إذا +ا٠+Ø£Ù +Ø¥Ù +اÙÙØ§ +Ø£ÙÙØ§ +Ø¥ÙÙØ§ +اÙÙ +Ø£ÙÙ +Ø¥ÙÙ +با٠+بأ٠+ÙØ§Ù +ÙØ£Ù +ÙØ§Ù +ÙØ£Ù +ÙØ¥Ù +Ø§ÙØªÙ +Ø§ÙØªÙ +Ø§ÙØ°Ù +Ø§ÙØ°Ù +Ø§ÙØ°ÙÙ +اÙÙ +اÙÙ +Ø¥ÙÙ +Ø¥ÙÙ +عÙÙ +عÙÙÙØ§ +عÙÙÙ +ا٠ا +أ٠ا +إ٠ا +Ø§ÙØ¶Ø§ +Ø£ÙØ¶Ø§ +ÙÙ +ÙÙÙ +ÙÙ +ÙÙÙ +ÙÙ +ÙÙÙ +ÙÙ +ÙÙ +ÙÙ +ÙÙÙ +ÙÙÙ +ÙÙÙ +ÙÙÙ +ÙÙÙ +ÙÙÙ +Ø§ÙØª +Ø£ÙØª +ÙÙ +ÙÙØ§ +ÙÙ +ÙØ°Ù +ÙØ°Ø§ +تÙÙ +ذÙÙ +ÙÙØ§Ù +ÙØ§Ùت +ÙØ§Ù +ÙÙÙÙ +تÙÙÙ +ÙÙØ§Ùت +ÙÙØ§Ù +ØºÙØ± +بعض +ÙØ¯ +ÙØÙ +بÙÙ +بÙÙ٠ا +Ù ÙØ° +ض٠٠+ØÙØ« +Ø§ÙØ§Ù +Ø§ÙØ¢Ù +Ø®ÙØ§Ù +بعد +ÙØ¨Ù +ØØªÙ +Ø¹ÙØ¯ +Ø¹ÙØ¯Ù ا +ÙØ¯Ù +Ø¬Ù ÙØ¹ 
http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_bg.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_bg.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_bg.txt new file mode 100644 index 0000000..1ae4ba2 --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_bg.txt @@ -0,0 +1,193 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +а +аз +ако +ала +бе +без +беÑе +би +бил +била +били +било +близо +бÑÐ´Ð°Ñ +бÑде +бÑÑ Ð° +в +Ð²Ð°Ñ +Ð²Ð°Ñ +ваÑа +веÑоÑÑно +веÑе +взема +ви +вие +винаги +вÑе +вÑеки +вÑиÑки +вÑиÑко +вÑÑка +вÑв +вÑпÑеки +вÑÑÑ Ñ +г +ги +главно +го +д +да +дали +до +докаÑо +докога +доÑи +доÑега +доÑÑа +е +едва +един +еÑо +за +зад +заедно +заÑади +заÑега +заÑова +заÑо +заÑоÑо +и +из +или +им +има +Ð¸Ð¼Ð°Ñ +иÑка +й +каза +как +каква +какво +какÑо +какÑв +каÑо +кога +когаÑо +коеÑо +коиÑо +кой +койÑо +колко +коÑÑо +кÑде +кÑдеÑо +кÑм +ли +м +ме +Ð¼ÐµÐ¶Ð´Ñ +мен +ми +мнозина +мога +Ð¼Ð¾Ð³Ð°Ñ +може +Ð¼Ð¾Ð»Ñ +моменÑа +Ð¼Ñ +н +на +над +назад +най +напÑави +напÑед +напÑÐ¸Ð¼ÐµÑ +Ð½Ð°Ñ +не +него +Ð½ÐµÑ +ни +ние +никой +ниÑо +но +нÑкои +нÑкой +нÑма +обаÑе +около +оÑвен +оÑобено +Ð¾Ñ +оÑгоÑе +оÑново +оÑе +пак +по +повеÑе +повеÑеÑо +под +поне +поÑади +поÑле +поÑÑи +пÑави +пÑед +пÑеди +пÑез +пÑи +пÑк +пÑÑво +Ñ +Ñа +Ñамо +Ñе +Ñега +Ñи +ÑкоÑо +Ñлед +Ñме +ÑпоÑед +ÑÑед +ÑÑеÑÑ +ÑÑе +ÑÑм +ÑÑÑ +ÑÑÑо +Ñ +Ñази +Ñака +Ñакива +ÑакÑв +Ñам +Ñвой +Ñе +Ñези +Ñи +Ñн +Ñо +Ñова +Ñогава +Ñози +Ñой +Ñолкова +ÑоÑно +ÑÑÑбва +ÑÑк +ÑÑй +ÑÑ +ÑÑÑ +Ñ +Ñ Ð°ÑеÑва +Ñ +Ñе +ÑеÑÑо +ÑÑез +Ñе +Ñом +Ñ 
http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_ca.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_ca.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_ca.txt new file mode 100644 index 0000000..3da65de --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_ca.txt @@ -0,0 +1,220 @@ +# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) +a +abans +acà +ah +aixà +això +al +als +aleshores +algun +alguna +algunes +alguns +alhora +allà +allà +allò +altra +altre +altres +amb +ambdós +ambdues +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquà +baix +cada +cadascú +cadascuna +cadascunes +cadascuns +com +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels +des +després +dins +dintre +donat +doncs +durant +e +eh +el +els +em +en +encara +ens +entre +érem +eren +éreu +es +és +esta +està +està vem +estaven +està veu +esteu +et +etc +ets +fins +fora +gairebé +ha +han +has +havia +he +hem +heu +hi +ho +i +igual +iguals +ja +l'hi +la +les +li +li'n +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +més +meu +meus +meva +meves +molt +molta +moltes +molts +mon +mons +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +però +perquè +poc +poca +pocs +poques +potser +propi +qual +quals +quan +quant +que +què +quelcom +qui +quin +quina +quines +quins +s'ha +s'han +sa +semblant +semblants +ses +seu +seus +seva +seva +seves +si +sobre +sobretot +sóc +solament +sols +son +són +sons +sota +sou +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +teu +teus +teva +teves +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu 
+vosaltres +vostra +vostre +vostres http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_cz.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_cz.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_cz.txt new file mode 100644 index 0000000..53c6097 --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_cz.txt @@ -0,0 +1,172 @@ +a +s +k +o +i +u +v +z +dnes +cz +tÃmto +budeÅ¡ +budem +byli +jseÅ¡ +můj +svým +ta +tomto +tohle +tuto +tyto +jej +zda +proÄ +máte +tato +kam +tohoto +kdo +kteÅà +mi +nám +tom +tomuto +mÃt +nic +proto +kterou +byla +toho +protože +asi +ho +naÅ¡i +napiÅ¡te +re +což +tÃm +takže +svých +jejà +svými +jste +aj +tu +tedy +teto +bylo +kde +ke +pravé +ji +nad +nejsou +Äi +pod +téma +mezi +pÅes +ty +pak +vám +ani +když +vÅ¡ak +neg +jsem +tento +Älánku +Älánky +aby +jsme +pÅed +pta +jejich +byl +jeÅ¡tÄ +až +bez +také +pouze +prvnà +vaÅ¡e +která +nás +nový +tipy +pokud +může +strana +jeho +své +jiné +zprávy +nové +nenà +vás +jen +podle +zde +už +být +vÃce +bude +již +než +který +by +které +co +nebo +ten +tak +má +pÅi +od +po +jsou +jak +dalšà +ale +si +se +ve +to +jako +za +zpÄt +ze +do +pro +je +na +atd +atp +jakmile +pÅiÄemž +já +on +ona +ono +oni +ony +my +vy +jà +ji +mÄ +mne +jemu +tomu +tÄm +tÄmu +nÄmu +nÄmuž +jehož +jÞ +jelikož +jež +jakož +naÄež http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_da.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_da.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_da.txt new file mode 100644 index 0000000..42e6145 --- /dev/null +++ 
b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_da.txt @@ -0,0 +1,110 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +pÃ¥ | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. +selv | myself/youself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. 
+vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +nÃ¥r | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +ogsÃ¥ | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sÃ¥dan | such, like this/like that http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_de.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_de.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_de.txt new file mode 100644 index 0000000..86525e7 --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_de.txt @@ -0,0 +1,294 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. 
+ + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. + + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daà | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. 
of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_el.txt ---------------------------------------------------------------------- diff --git 
a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_el.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_el.txt new file mode 100644 index 0000000..232681f --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_el.txt @@ -0,0 +1,78 @@ +# Lucene Greek Stopwords list +# Note: by default this file is used after GreekLowerCaseFilter, +# so when modifying this file use 'Ï' instead of 'Ï' +ο +η +Ïο +οι +Ïα +ÏÎ¿Ï +ÏÎ·Ï +ÏÏν +Ïον +Ïην +και +κι +κ +ειμαι +ειÏαι +ειναι +ειμαÏÏε +ειÏÏε +ÏÏο +ÏÏον +ÏÏη +ÏÏην +μα +αλλα +αÏο +για +ÏÏÎ¿Ï +με +Ïε +ÏÏ +ÏαÏα +ανÏι +καÏα +μεÏα +θα +να +δε +δεν +μη +μην +εÏι +ÎµÎ½Ï +εαν +αν +ÏοÏε +ÏÎ¿Ï +ÏÏÏ +ÏÎ¿Î¹Î¿Ï +Ïοια +Ïοιο +Ïοιοι +ÏÎ¿Î¹ÎµÏ +ÏοιÏν +ÏÎ¿Î¹Î¿Ï Ï +Î±Ï ÏÎ¿Ï +Î±Ï Ïη +Î±Ï Ïο +Î±Ï Ïοι +Î±Ï ÏÏν +Î±Ï ÏÎ¿Ï Ï +Î±Ï ÏÎµÏ +Î±Ï Ïα +ÎµÎºÎµÎ¹Î½Î¿Ï +εκεινη +εκεινο +εκεινοι +ÎµÎºÎµÎ¹Î½ÎµÏ +εκεινα +εκεινÏν +ÎµÎºÎµÎ¹Î½Î¿Ï Ï +οÏÏÏ +ομÏÏ +ιÏÏÏ +οÏο +οÏι http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_en.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_en.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_en.txt new file mode 100644 index 0000000..2c164c0 --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_es.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_es.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_es.txt new file mode 100644 index 0000000..487d78c --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_es.txt @@ -0,0 +1,356 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? 
+unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería 
+serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + http://git-wip-us.apache.org/repos/asf/incubator-sdap-nexus/blob/ff98fa34/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_eu.txt ---------------------------------------------------------------------- diff --git a/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_eu.txt b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_eu.txt new file mode 100644 index 0000000..25f1db9 --- /dev/null +++ b/data-access/config/schemas/solr-7.1.0/nexustiles/conf/lang/stopwords_eu.txt @@ -0,0 +1,99 @@ +# example set of basque stopwords +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek +haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten
