I apologize for posting to the list, but since 14 Jan 2006 I've had no
luck sending the following reply via email:
Tor Olav Stava wrote this on Mon, Jan 09, 2006 at 10:30:59AM +0100.
My reply is below.
> I'll be happy to test any code on my systems.
Sorry for the delay. Attached, please find a replacement for
HarvestPackages.py.
It takes a long time to run, but doesn't do a lot of Internet access.
If you find it downloading stuff, you probably need to install the
*xml docbook*.
HarvestPackages.py produces a copy of pkgs.dat on standard output, so
you want to redirect output to that file. It finds 369 packages in
the BLFS BOOK.
The AuditPackages.py script needs to be changed to "fix" packages that
have alternate dependencies. This is not done yet.
Please let me know how HarvestPackages.py works for you now.
--
.. Chuck Rhode, Sheboygan, WI, USA
.. 30°F. Wind NNW 18 mph. Cloudy.
#!/usr/bin/python
# HarvestPackages.py
# 2004 FEB 28 . ccr
# Extract a database of package names, source archives, dependencies,
# and installation procedures from the xml version of the Beyond
# Linux from Scratch book.
# 2006 JAN 09 . ccr . Migrate to BLFS 6.1. Switch from xml.sax to
# . . libxml2.
import sys
import os
import optparse
import PackageDB
import libxml2
# Shared literal values used throughout the script.
ZERO = 0  # Used for length tests and to terminate OR-ed flag lists.
SPACE = ' '
NULL = ''  # The empty string.
NUL = '\x00'
NA = -1  # Compared against str.find() results to detect "not found".
class cAttributes(dict):
    """Dictionary of element attributes with named accessor methods.

    The accessor names presumably mirror the xml.sax attributes interface
    that this script used before its migration to libxml2.
    """

    def getNames(self):
        """Return the attribute names (the dictionary's keys)."""
        return self.keys()

    def getValue(self, aKey, aDefault=None):
        """Return the value stored under aKey, or aDefault when absent."""
        return self.get(aKey, aDefault)
class cDocHandler(object):
    """Drive a libxml2 text reader and dispatch handler callbacks.

    ReadLoop() pulls nodes from the reader one at a time and calls the
    matching callback method.  Every callback here is a no-op; subclasses
    override the ones they care about.  Note that ReadLoop() never calls
    endDocument().
    """

    def __init__(self, aReader, aParseEntities=False):
        """Remember the reader.  aParseEntities is accepted but unused."""
        self.fReader = aReader
        return

    def startDocument(self):
        """Called for a document node."""
        return

    def endDocument(self):
        """Placeholder callback; not invoked by ReadLoop()."""
        return

    def startElement(self, aName, aAttrs):
        """Called when an element opens.

        aAttrs is a dict of attribute name -> value, or None when the
        element carries no attributes.
        """
        return

    def endElement(self, aName, aIsEmpty=False):
        """Called when an element closes.

        aIsEmpty is True when the call is synthesized for an empty element
        immediately after its startElement().
        """
        return

    def characters(self, aContent):
        """Called with the text of text, CDATA and significant-whitespace
        nodes."""
        return

    def ignorableWhitespace(self):
        """Called for whitespace nodes."""
        return

    def processingInstruction(self, aTarget, aData):
        """Called for a processing instruction."""
        return

    def skippedEntity(self, aName):
        """Called for an entity reference node."""
        return

    def Read(self):
        """Advance the reader to the next node and return its status code."""
        return self.fReader.Read()

    def ReadLoop(self):
        """Read nodes until Read() stops returning 1, dispatching each one.

        A negative final status means the reader hit an error rather than
        the end of the input; that is reported on stderr so that a
        truncated harvest is not mistaken for a complete one.
        """
        status = self.Read()
        while status == 1:
            reader = self.fReader
            # Capture the node's own properties first: stepping through the
            # attributes below repositions the reader onto each attribute.
            node_type = reader.NodeType()
            name = reader.Name()
            is_empty = reader.IsEmptyElement()
            if reader.HasValue():
                text = reader.Value()
            else:
                text = None
            if reader.HasAttributes():
                attrs = {}
                while reader.MoveToNextAttribute():
                    attrs[reader.Name()] = reader.Value()
            else:
                attrs = None

            if node_type == libxml2.XML_READER_TYPE_ELEMENT:
                self.startElement(name, attrs)
                if is_empty:
                    # Give empty elements a matching close straight away.
                    self.endElement(name, is_empty)
            elif node_type == libxml2.XML_READER_TYPE_END_ELEMENT:
                self.endElement(name)
            elif node_type in (libxml2.XML_READER_TYPE_TEXT,
                               libxml2.XML_READER_TYPE_CDATA,
                               libxml2.XML_READER_TYPE_SIGNIFICANT_WHITESPACE):
                self.characters(text)
            elif node_type == libxml2.XML_READER_TYPE_WHITESPACE:
                self.ignorableWhitespace()
            elif node_type == libxml2.XML_READER_TYPE_ENTITY_REFERENCE:
                self.skippedEntity(name)
            elif node_type == libxml2.XML_READER_TYPE_PROCESSING_INSTRUCTION:
                self.processingInstruction(name, text)
            elif node_type == libxml2.XML_READER_TYPE_DOCUMENT:
                self.startDocument()
            else:
                # NONE, ATTRIBUTE (handled above), ENTITY, COMMENT,
                # DOCUMENT_TYPE, DOCUMENT_FRAGMENT, NOTATION, END_ENTITY and
                # XML_DECLARATION nodes need no action.
                pass
            status = self.Read()

        if status < 0:
            sys.stderr.write(
                'XML reader reported an error (status %d); '
                'processing stopped before the end of the document.\n'
                % status)
        return
class cStateStack(list):
    """Stack of currently open element names.

    The most recently opened element is the last list item.  Patterns
    passed to IsMostRecently() may contain a '*'; only the text before
    the '*' is then compared.
    """

    def __init__(self):
        # Name of the last element popped as an "empty element", kept so
        # that a redundant close event for it can be tolerated.
        self.fPoppedItem = None
        return

    def Push(self, aItem):
        """Record aItem as the most recently opened element."""
        self.append(aItem)
        return

    def Pop(self, aItem, aEmptyElt=False):
        """Close element aItem and return the stack entry removed.

        The top entry only has to start with aItem (case-sensitive), so
        entries pushed with a suffix still match.  Returns None when aItem
        merely repeats the close of the empty element popped just before.
        Any other mismatch is reported on stderr, the stack is printed,
        and the program exits via sys.exit('halt').
        """
        if self.IsMostRecently(['%s*' % aItem], aIgnoreCase=False):
            popped = self.pop()
            if aEmptyElt:
                self.fPoppedItem = popped
            else:
                self.fPoppedItem = None
            return popped
        if aItem == self.fPoppedItem:
            # We've inexplicably received an endElement for an empty
            # element.
            self.fPoppedItem = None
            return None
        sys.stderr.write(
            '\n'
            'State stack corrupted. %s closing but not most recent. Stack\n'
            'unchanged.\n' % aItem)
        # Single-argument print(...) behaves the same under Python 2 and 3.
        print(self)
        sys.exit('halt')

    def GetDepth(self):
        """Return the number of open elements."""
        return len(self)

    def IsMostRecently(self, aList, aIgnoreCase=True):
        """Tell whether the top of the stack matches the patterns in aList.

        aList is ordered outermost first, so its last pattern is compared
        with the top of the stack.  Returns False when aList is longer
        than the stack.
        """
        def Matches(aDepth):
            # Compare the pattern and the stack entry aDepth from the end.
            pattern = aList[-aDepth]
            element = self[-aDepth]
            if aIgnoreCase:
                pattern = pattern.lower()
                element = element.lower()
            if pattern == element:
                return True
            star = pattern.find('*')
            if star < 0:
                return False
            return pattern[:star] == element[:star]

        depth = len(aList)
        if depth > self.GetDepth():
            return False
        result = Matches(depth)
        while depth > 1 and result:
            depth -= 1
            result = Matches(depth)
        return result
class cBook(cDocHandler):
    """Harvest package records from the book's XML node stream.

    Element open/close events maintain a stack of open element names.
    The tail of that stack selects a handler, which collects the package
    name and version, archive and patch URLs, dependencies and commands
    into a PackageDB.cPackage.  A finished package is appended to the
    package list when its sect1 closes.
    """

    def __init__(self, aReader, aPackageList):
        """Bind the reader and the list that receives harvested packages."""
        cDocHandler.__init__(self, aReader)
        self.fPackageList = aPackageList
        self.fStateStack = cStateStack()
        self.fBuffer = NULL
        # Initialized so that handlers which inspect the current title do
        # not fail with AttributeError before the first title is read.
        self.fTitle = NULL
        self.fPackageName = NULL
        self.fPackageVersion = NULL
        self.fURL = NULL
        self.fExternal = NULL
        self.fPackage = None
        self.fCapture = False
        return

    def startElement(self, aName, aAttrs):
        """Handle the opening of element aName."""
        self.ProcessElement(aName, 'init', aAttrs=aAttrs)
        return

    def endElement(self, aName, aIsEmpty=False):
        """Handle the closing of element aName."""
        self.ProcessElement(aName, 'term', aIsEmpty=aIsEmpty)
        return

    def characters(self, aContent):
        """Accumulate text while a handler has capturing switched on."""
        if self.fCapture:
            self.fBuffer += aContent
        return

    def ProcessElement(self, aName, aFunction, aAttrs=None, aIsEmpty=False):
        """Update the element stack and run the handler for its new tail.

        aFunction is 'init' when the element opens; any other value is
        treated as a close.  aAttrs is the element's attribute dict or
        None; attribute names are matched case-insensitively.
        """
        def Buffer(aIsInit):
            # Start capturing (with an empty buffer) on open; stop on close.
            self.fCapture = aIsInit
            if self.fCapture:
                self.fBuffer = NULL
            return

        def InDependencySection():
            # True under a title starting "required" or "recommended".
            title = self.fTitle.lower()
            return (title.startswith('required') or
                    title.startswith('recommended'))

        def PushElement(aIsInit, aName):
            if aIsInit:
                if aName in ['bridgehead']:
                    # Stack a bridgehead together with its renderas value
                    # so that its level can be matched later.
                    aName += safe_attrs.get('renderas', NULL)
                self.fStateStack.Push(aName)
            return

        def PopElement(aIsInit, aName, aIsEmpty):
            if not aIsInit:
                self.fStateStack.Pop(aName, aIsEmpty)
            return

        def ProcessSect1(aIsInit):
            if aIsInit:
                self.fPackageName = safe_attrs.get('id')
                self.fPackageVersion = safe_attrs.get('xreflabel')
            else:
                if self.fPackage is not None:
                    self.fPackageList.Append(self.fPackage)
                self.fPackage = None
            return

        def ProcessTitle(aIsInit):
            Buffer(aIsInit)
            if not aIsInit:
                self.fTitle = self.fBuffer
                # Without an xreflabel, fall back to the first title read.
                if self.fPackageVersion in [NULL, None]:
                    self.fPackageVersion = self.fTitle
            return

        def ProcessPackage(aIsInit):
            if not aIsInit:
                if self.fTitle.lower().startswith('package information'):
                    if self.fPackage is None:
                        self.fPackage = PackageDB.cPackage()
                        self.fPackage.SetNameVersion(self.fPackageName,
                                                     self.fPackageVersion)
                        if OPTS.Verbose == True:
                            sys.stderr.write(
                                '%s\n' % self.fPackage.GetNameVersion())
            return

        def ProcessURL(aIsInit):
            if aIsInit:
                # A ulink without a url attribute yields an empty URL
                # rather than an AttributeError on None.
                self.fURL = (safe_attrs.get('url') or NULL).strip()
            else:
                if self.fPackage is None or self.fURL in [NULL, None]:
                    pass
                elif self.fTitle.lower().startswith('package information'):
                    self.fPackage.fArchiveList = [self.fURL]
                elif self.fTitle.lower().startswith('additional download'):
                    self.fPackage.AppendPatch(self.fURL)
            return

        def ProcessConjunction(aIsInit):
            if InDependencySection():
                Buffer(aIsInit)
                if not aIsInit:
                    tokens = [tok.lower() for tok in self.fBuffer.split()]
                    # Guarded like the other dependency handlers: there may
                    # be no package under construction.
                    if 'or' in tokens and self.fPackage is not None:
                        self.fPackage.AppendDependency('or')
            return

        def ProcessExternal(aIsInit):
            if aIsInit:
                self.fExternal = safe_attrs.get('linkend')
            else:
                if InDependencySection():
                    if self.fPackage is not None:
                        self.fPackage.AppendDependency(self.fExternal)
            return

        def ProcessForeign(aIsInit):
            Buffer(aIsInit)
            if not aIsInit:
                if InDependencySection():
                    if self.fPackage is not None:
                        self.fPackage.AppendDependency(self.fBuffer)
            return

        def ProcessCode(aIsInit):
            Buffer(aIsInit)
            if not aIsInit:
                if self.fPackage is not None:
                    # Prefix the commands with the current title as a
                    # comment line.
                    self.fPackage.AppendCommand(
                        '# %s\n%s' % (self.fTitle, self.fBuffer))
            return

        is_init = aFunction in ['init']
        safe_attrs = {}
        if aAttrs is not None:
            for (key, value) in aAttrs.items():
                safe_attrs[key.lower()] = value

        # Push before dispatching and pop afterwards, so that handlers see
        # the element itself on top of the stack for both open and close.
        PushElement(is_init, aName)
        stack = self.fStateStack
        if stack.IsMostRecently(['sect1']):
            ProcessSect1(is_init)
        elif (stack.IsMostRecently(['title']) or
              stack.IsMostRecently(['bridgehead*'])):
            ProcessTitle(is_init)
            if (stack.IsMostRecently(['sect3', 'title']) or
                    stack.IsMostRecently(['bridgeheadsect3'])):
                ProcessPackage(is_init)
        elif stack.IsMostRecently(
                ['itemizedlist', 'listitem', 'para', 'ulink']):
            ProcessURL(is_init)
        elif stack.IsMostRecently(['para']):
            ProcessConjunction(is_init)
        elif stack.IsMostRecently(['para', 'xref']):
            ProcessExternal(is_init)
        elif stack.IsMostRecently(['para', 'ulink']):
            ProcessForeign(is_init)
        elif stack.IsMostRecently(['screen', 'userinput']):
            ProcessCode(is_init)
        PopElement(is_init, aName, aIsEmpty)
        return
# Script driver: parse the command line, read the book, store the result.
__Parser = optparse.OptionParser()
__DefaultIndexDoc = os.path.expanduser('~/BLFS/BOOK/index.xml')
__Parser.add_option('-I', '--IndexDoc',
                    help='Root of the XML document. Default is %s.' %
                    (__DefaultIndexDoc),
                    default=__DefaultIndexDoc)
__Parser.add_option('-P', '--PackageDB',
                    help='Output file name to receive package database. '
                         'Default is > stdout.')
__Parser.add_option('-V', '--Verbose',
                    action='store_true',
                    help='List package names during processing.',
                    default=False)
(OPTS, __Args) = __Parser.parse_args()
if len(__Args) > ZERO:
    __Parser.error('Arguments are prohibited.')
if not os.path.exists(OPTS.IndexDoc):
    __Parser.error(OPTS.IndexDoc + ' not found.')
# Normalize every "no output file given" spelling to None before storing.
if OPTS.PackageDB in [None, NULL, '> stdout']:
    OPTS.PackageDB = None
__Flags = (
    libxml2.XML_PARSE_XINCLUDE |  # Expand xincludes.
    libxml2.XML_PARSE_NOENT |  # Expand entities. I know this doesn't look right.
    libxml2.XML_PARSE_NOBLANKS |  # Suppress whitespace.
    ZERO)
__Book = cBook(libxml2.readerForFile(OPTS.IndexDoc,
                                     'ascii',
                                     __Flags,
                                     ),
               PackageDB.PackageList,
               )
__Book.ReadLoop()
PackageDB.Store(OPTS.PackageDB)
# Fin
--
http://linuxfromscratch.org/mailman/listinfo/blfs-support
FAQ: http://www.linuxfromscratch.org/blfs/faq.html
Unsubscribe: See the above information page