renukesh nk wrote:

> i want to download zip files from a website. my script first lists all
> the url links to a text file and then fetches each url and tries to
> download the zip files.
>
> but i am getting an error as below:
>
> Running script..
> https://sagamusix.dehttps://sagamusix.de/other/Saga%20Musix%20-%20Colors%20of%20Synth1%20v1.0.zip
> /n
> https://sagamusix.dehttps://sagamusix.de/sample_collection/bass.zip
> /n
> https://sagamusix.dehttps://sagamusix.de/sample_collection/bass_drums.zip
> /n
> https://sagamusix.dehttps://sagamusix.de/sample_collection/drums.zip
> /n
> https://sagamusix.dehttps://sagamusix.de/sample_collection/fx.zip
> /n
> https://sagamusix.dehttps://sagamusix.de/sample_collection/pads_strings.zip
> /n
> https://sagamusix.dehttps://sagamusix.de/sample_collection/powerchords.zip
> /n
> https://sagamusix.dehttps://sagamusix.de/sample_collection/synths.zip
> /n
> https://sagamusix.dehttps://sagamusix.de/sample_collection/tr-808.zip
> /n
> https://sagamusix.dehttps://sagamusix.de/sample_collection/tr-909.zip
> /n
> Saga%20Musix%20-%20Colors%20of%20Synth1%20v1.0.zip
>
> Trying to reach https://sagamusix.dehttps://sagamusix.de/other/Saga%20Musix%20-%20Colors%20of%20Synth1%20v1.0.zip
Look at the output above: there are two URLs glued together. You only want

https://sagamusix.de/other/Saga%20Musix%20-%20Colors%20of%20Synth1%20v1.0.zip

but your attempt to remove the extra "https://sagamusix.de" here

>             zipfile = line[24:]
>             #Removes the last 4 characters to remove the .zip
>             zipfile2 = zipfile[:3]
>             print "Trying to reach " + ziplink

is then ignored when you use the complete line below:

>             response = urllib2.urlopen(ziplink)

Generally speaking your script is too complex. Start with something really
simple, and later add error handling as needed. Why would you prepend the
site in the first place? Something like

import os
import posixpath
import urllib
import urllib2

from bs4 import BeautifulSoup


def download(ziplink, targetdirectory):
    filename = os.path.join(targetdirectory, posixpath.basename(ziplink))
    print "downloading", ziplink
    print "--> ", filename
    urllib.urlretrieve(ziplink, filename)


if __name__ == "__main__":
    targetdirectory = "test"
    os.mkdir(targetdirectory)

    page = urllib2.urlopen('https://sagamusix.de/en/samples/').read()
    soup = BeautifulSoup(page, "html5lib")

    for anchor in soup.findAll('a', href=True):
        link = anchor['href']
        if link.endswith(".zip"):
            download(link, targetdirectory)

should work and is perfectly fine for a throwaway script.

Unrelated, but likely to trip you elsewhere:

> if line.find('/'):

>>> if "foo".find("/"): print "yes"
...
yes
>>> if "/foo".find("/"): print "yes"
...
>>>

i.e. str.find() is falsey only if the searched string begins with the
search token. That's because find() returns the position (and 0 is a
possible position, as indices in Python start with 0) and -1 (which is
truthy) to indicate that the search token was not found. What you want
is the `in` operator:

>>> "/" in "foo"
False
>>> "/" in "/foo"
True
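Applied to your loop, the membership test would look roughly like this (a
minimal sketch, untested; the strip() assumes each line in zipfiles.txt
still ends with a newline):

with open('zipfiles.txt') as urls:
    for line in urls:
        ziplink = line.strip()  # drop the trailing newline
        if '/' in ziplink:
            # everything after the last slash, i.e. the bare filename
            print ziplink.rsplit('/', 1)[1]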
> We failed to reach a server.https://sagamusix.dehttps://sagamusix.de/other/Saga%20Musix%20-%20Colors%20of%20Synth1%20v1.0.zip
>
> Reason: [Errno 11001] getaddrinfo failed
> bass.zip
>
> please help me to fix this so that i can download all the zip files
>
> code:
>
> import urllib2
> from urllib2 import Request, urlopen, URLError
> #import urllib
> import os
> from bs4 import BeautifulSoup
> # import socket
> # socket.getaddrinfo('localhost', 8080)
>
> #Create a new directory to put the files into
> #Get the current working directory and create a new directory in it named
> #test
> cwd = os.getcwd()
> newdir = cwd + "\\test"
> print "The current Working directory is " + cwd
> os.mkdir( newdir);
> print "Created new directory " + newdir
> newfile = open('zipfiles.txt', 'w')
> print newfile
>
> print "Running script.. "
> #Set variable for page to be open and url to be concatenated
> url = "https://sagamusix.de"
> page = urllib2.urlopen('https://sagamusix.de/en/samples/').read()
>
> #File extension to be looked for.
> extension = ".zip"
>
> #Use BeautifulSoup to clean up the page
> soup = BeautifulSoup(page, "html5lib")
> soup.prettify()
>
> #Find all the links on the page that end in .zip
> for anchor in soup.findAll('a', href=True):
>     links = url + anchor['href']
>     if links.endswith(extension):
>         newfile.write(links + '\n')
> newfile.close()
>
> #Read what is saved in zipfiles.txt and output it to the user
> #This is done to create persistent data
> newfile = open('zipfiles.txt', 'r')
> for line in newfile:
>     print line + '/n'
> newfile.close()
>
> #Read through the lines in the text file and download the zip files.
> #Handle exceptions and print exceptions to the console
> with open('zipfiles.txt', 'r') as url:
>     for line in url:
>         if line.find('/'):
>             print line.rsplit('/', 1)[1]
>
>         try:
>             ziplink = line
>             #Removes the first 48 characters of the url to get the name of the file
>             zipfile = line[24:]
>             #Removes the last 4 characters to remove the .zip
>             zipfile2 = zipfile[:3]
>             print "Trying to reach " + ziplink
>             response = urllib2.urlopen(ziplink)
>         except URLError as e:
>             print 'We failed to reach a server.' + ziplink
>             if hasattr(e, 'reason'):
>                 print 'Reason: ', e.reason
>                 continue
>             elif hasattr(e, 'code'):
>                 print 'The server couldnt fulfill the request.'
>                 print 'Error code: ', e.code
>                 continue
>         else:
>             zipcontent = response.read()
>             completeName = os.path.join(newdir, zipfile2 + ".zip")
>             with open(completeName, 'w') as f:
>                 print "downloading.. " + zipfile
>                 f.write(zipcontent)
>                 f.close()
> print "Script completed"
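Should you want to keep the two-pass approach with zipfiles.txt, the
download loop can shrink along the same lines. A rough sketch, untested,
assuming the file now contains one plain absolute URL per line (no
prepended site) and that the "test" directory already exists:

import os
import posixpath
import urllib2
from urllib2 import URLError

targetdirectory = "test"

with open('zipfiles.txt') as urls:
    for line in urls:
        ziplink = line.strip()
        # derive the filename from the URL instead of slicing fixed offsets
        filename = os.path.join(targetdirectory, posixpath.basename(ziplink))
        print "Trying to reach " + ziplink
        try:
            response = urllib2.urlopen(ziplink)
        except URLError as e:
            print "We failed to reach a server. " + ziplink
            if hasattr(e, 'reason'):
                print "Reason: ", e.reason
            elif hasattr(e, 'code'):
                print "Error code: ", e.code
        else:
            print "downloading.. " + filename
            # zip files are binary data; open the target in 'wb' mode
            with open(filename, 'wb') as f:
                f.write(response.read())

print "Script completed"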
