On Tuesday, April 10, 2018 at 3:28:05 AM UTC-4, Thomas Jollans wrote:
> On 2018-04-10 07:06, T Berger wrote:
> > This is the first time I've joined a google group and I don't understand
> > the setup. Why are most of the posts in this group unrelated to python,
> > and how do I filter this junk (sorry) out?
> >
>
> Welcome to python-list/comp.lang.python!
>
> This isn't originally a Google group. Google just mirrors the old USENET
> group, which is awash with spam.
>
> There is also a mailing list version of this group (posts are mirrored
> both ways) at https://mail.python.org/mailman/listinfo/python-list
>
> The mailing list has proper spam filtering and some moderation. None (or
> barely any) of the regulars use Google Groups. Some people use USENET
> directly and maintain their own extensive filtering regime to make it
> readable. Probably most of us use the mailing list, because it's just so
> much nicer!
>
> -- Thomas
Here's my Python code for filtering Google Groups again. To run it from a bookmark in Firefox, byte-compile the script and bookmark the .pyc file. You also need to create the bannedAuthors.txt and bannedSubjects.txt files (there's a sketch of what those files look like after the script).

# remove banned authors, authors with mostly caps, and banned subjects
# to compile to pyc:
# >>> import py_compile
# >>> py_compile.compile("file.py")

import urllib2
import webbrowser
import os
import argparse

from bs4 import BeautifulSoup


class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg


PALEMOON = 'Mozilla/5.0 (Windows NT 6.1; WOW64) KHTML/4.11 Gecko/20130308 Firefox/33.0 (PaleMoon/25.2)'
WATERFOX = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:40.0) Gecko/20100101 Firefox/51.1.0 Waterfox/51.1.0'
USERAGENTBASE = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:40.0) Gecko/20100101 '
BROWSERPATH = 'C:\\"Program Files"\\Waterfox\\waterfox.exe'
FILENAME = 'C:\\Pystuff\\pygroup.htm'
SEDFILENAME = 'C:\\Pystuff\\SED.htm'
WEBPAGE_START = "https://groups.google.com/forum/?_escaped_fragment_=forum/"
PYGROUP_WEBPAGE = "comp.lang.python%5B"
SED_WEBPAGE = "sci.electronics.design%5B"
WEBPAGE_END = "%5D"
BANNED_AUTHORS_FILE = 'C:\\Pystuff\\bannedAuthors.txt'
BANNED_SUBJECTS_FILE = 'C:\\Pystuff\\bannedSubjects.txt'


def getUserAgentVersion():
    """Get the user agent version.

    returns agentVersion -- user agent version in the format
                            Firefox/51.0.1 Waterfox/51.0.1
    """
    bvers = os.popen(BROWSERPATH + " -v").read()
    bversList = bvers.split()
    agentVersion = 'Firefox/' + bversList[2] + ' ' + bversList[1] + '/' + bversList[2]
    return agentVersion


def getwebpage(url):
    """Open a webpage.

    url -- the url of the webpage

    returns page -- the source for the webpage
    """
    user_agent = USERAGENTBASE + getUserAgentVersion()
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    page = response.read()
    return page


def getBannedAuthors():
    """Convert the banned authors text file into a list.

    returns bannedAuthors -- list of banned author strings
    """
    f = open(BANNED_AUTHORS_FILE, 'r')
    bannedAuthors = f.read().split('\n')
    f.close()
    return bannedAuthors


def getBannedSubjects():
    """Convert the banned subjects text file into a list.

    returns bannedSubjects -- list of banned subject strings
    """
    f = open(BANNED_SUBJECTS_FILE, 'r')
    bannedSubjects = f.read().split('\n')
    f.close()
    return bannedSubjects


def removeBadAuthors(html_doc, filecode):
    """Remove posts whose author is mostly caps or on the banned list,
    or whose subject contains a banned substring.

    html_doc -- an html document
    filecode -- mode for writing the output file ('w' or 'a')

    returns postcount -- number of posts that survived filtering
    """
    bannedAuthors = getBannedAuthors()
    bannedSubjects = getBannedSubjects()
    #print bannedAuthors

    soup = BeautifulSoup(html_doc)
    #print soup.prettify()

    post = soup.find("tr")
    postcount = 0
    banNoneCount = 0
    banNameCount = 0
    banBigCount = 0
    banSubjectCount = 0
    while post is not None:
        postcount += 1
        author = post.find("td", "author")
        subject = post.find("td", "subject")
        if author is None or subject is None:
            # row has no author or subject cell -- drop it
            print "Author is None"
            oldpost = post
            post = oldpost.find_next_sibling('tr')
            oldpost.decompose()
            postcount = postcount - 1
            banNoneCount += 1
        else:
            aname = author.get_text()
            print aname.encode("ascii", "ignore")
            # compare subjects lower-cased with spaces stripped
            asubject = subject.get_text().lower().replace(" ", "")
            bannedsubject = False
            for badsubject in bannedSubjects:
                print "BAD SUBJECT", badsubject
                if badsubject in asubject and len(badsubject) > 3:
                    print "ASUBJECT", asubject.encode("ascii", "ignore")
                    bannedsubject = True
                    break
            if bannedsubject:
                print "Subject is Banned"
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                oldpost.decompose()
                postcount = postcount - 1
                banSubjectCount += 1
            elif aname in bannedAuthors or 'smtb' in aname:
                print "Author is Banned"
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                oldpost.decompose()
                postcount = postcount - 1
                banNameCount += 1
            else:
                print author
                # drop posts whose author name is more than 70% capitals
                # or longer than 35 characters
                numCaps = 1.0 * sum(1 for c in aname if c.isupper())
                ratio = numCaps / (1.0 * len(aname))
                print ratio
                oldpost = post
                post = oldpost.find_next_sibling('tr')
                if ratio > 0.7 or len(aname) > 35:
                    oldpost.decompose()
                    postcount = postcount - 1
                    banBigCount += 1
                    print "BIG"
        if post is None:
            print "Post is NONE"

    f = open(FILENAME, filecode)
    f.write(soup.prettify().encode('ascii', 'ignore') + '<br>\n\r')
    f.write('<a> Banned No Name: ' + str(banNoneCount) + '</a>, ')
    f.write('<a> Banned Name: ' + str(banNameCount) + '</a>, ')
    f.write('<a> All Uppercase Name: ' + str(banBigCount) + '</a>, ')
    f.write('<a> Banned Subject: ' + str(banSubjectCount) + '</a>, ')
    f.write('<a> Total Banned: ' + str(banNoneCount + banNameCount + banBigCount + banSubjectCount) + '</a><br>\n\r')
    f.close()
    return postcount


def main(sed=None):
    if sed is None:
        parser = argparse.ArgumentParser()
        parser.add_argument('-s', '--sed', help="load sci.electronics.design group", action="store_true")
        args = parser.parse_args()
        if args.sed:
            webgroup = SED_WEBPAGE
        else:
            webgroup = PYGROUP_WEBPAGE
    else:
        if sed:
            webgroup = SED_WEBPAGE
        else:
            webgroup = PYGROUP_WEBPAGE

    # keep fetching pages of 50 topics until at least 10 posts survive filtering
    postcount = 0
    numberOposts = 0
    filecode = 'w'
    while postcount < 10:
        webpage = WEBPAGE_START + webgroup + str(numberOposts + 1) + '-' + str(numberOposts + 50) + WEBPAGE_END
        print webpage
        html_doc = getwebpage(webpage)
        postcount += removeBadAuthors(html_doc, filecode)
        if postcount < 10:
            numberOposts += 50
            filecode = 'a'
            print "postcount less than 10", postcount
            print "number of posts", numberOposts

    webbrowser.open(FILENAME)
    print 'done'


if __name__ == "__main__":
    main()
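In case anyone wants to try it, here's a rough sketch of the setup. Both banned-list files are plain text, one entry per line. Author entries have to match the displayed author name exactly; subject entries are substrings, and since the script lower-cases the subject and strips spaces before matching, they should be lower-case with no spaces (entries of three characters or fewer are ignored). The entries and the script name below are made-up examples, not my real lists:

    C:\Pystuff\bannedAuthors.txt
        A. Spammer
        MEGA DEALS DAILY

    C:\Pystuff\bannedSubjects.txt
        freemoney
        workfromhome

    # byte-compile the script so the .pyc can be bookmarked in Firefox
    >>> import py_compile
    >>> py_compile.compile("groupfilter.py")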