http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html new file mode 100644 index 0000000..d6f33a3 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html @@ -0,0 +1,91 @@ +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + You under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<!DOCTYPE html> +<html xmlns:wicket="http://wicket.apache.org"> +<head> +<meta charset="utf-8" /> +<title>Wicket extend</title> +</head> + +<body> + <wicket:extend> + <h2> + <wicket:message key="page.header.seedList">Seed list</wicket:message> + </h2> + + <div class="row"> + <div class="col-lg-8"> + <form class="form-horizontal" wicket:id="seedList"> + <fieldset> + <!-- Text input--> + <div class="form-group"> + <label class="col-md-4 control-label" for="textinput">Seed list name</label> + <div class="col-md-4"> + <input wicket:id="name" name="textinput" class="form-control input-md" type="text"> + </div> + </div> + <div class="form-group"> + <div class="col-md-offset-4 col-md-4"> + <button type="submit" class="btn btn-primary">Save</button> + </div> + </div> + </fieldset> + </form> + <h3>Seed urls</h3> + <table class="table table-hover table-striped tablesorter"> + <thead> + <tr> + <th class="header col-md-3">Url</th> + <th></th> + </tr> + </thead> + + <tbody wicket:id="seedUrlsTable"> + <tr wicket:id="seedUrls"> + <td> + <span wicket:id="url">http://google.com</span> + </td> + <td> + <button wicket:id="delete" class="btn btn-sm btn-danger" type="button"> + <span class="fa fa-trash-o"></span> + </button> + </td> + </tr> + </tbody> + </table> + <form class="form-horizontal" wicket:id="urlForm"> + <fieldset> + <div class="form-group"> + <div class="col-md-4"> + <input wicket:id="url" name="textinput" class="form-control input-md" type="text"> + </div> + <div> + <button wicket:id="addUrl" class="btn btn-primary">Add url</button> + </div> + </div> + </fieldset> + </form> + </div> + <div class="col-lg-4"> + <div class="panel panel-primary"> + <div class="panel-heading"> + <h3 class="panel-title">Help</h3> + </div> + <div class="panel-body"> + <p>Some help about seed management</p> + </div> + </div> + </div> + </div> + <!--row--> + </wicket:extend> +</body> +</html>
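[Editor's note] The SeedPage.html template above relies on Wicket markup inheritance: everything inside <wicket:extend> is rendered into the <wicket:child/> slot of the base page's markup, and every wicket:id in the template must be matched by a component added in the accompanying SeedPage.java (the next file in this commit). How these pages are wired into the web application is not part of this diff; the sketch below is only an illustration, assuming a Wicket WebApplication subclass with Spring injection enabled so that @SpringBean fields such as SeedListService resolve. The class name, home page, and mount paths are hypothetical.

// Hypothetical wiring sketch (not part of this commit): mounting pages such as
// SeedPage and SettingsPage in a Wicket application with Spring injection enabled.
package org.apache.nutch.webui;

import org.apache.nutch.webui.pages.seed.SeedPage;
import org.apache.nutch.webui.pages.settings.SettingsPage;
import org.apache.wicket.Page;
import org.apache.wicket.protocol.http.WebApplication;
import org.apache.wicket.spring.injection.annot.SpringComponentInjector;

public class NutchUiApplicationSketch extends WebApplication {

  @Override
  public Class<? extends Page> getHomePage() {
    return SeedPage.class; // placeholder; the real home page is not shown in this diff
  }

  @Override
  protected void init() {
    super.init();
    // Enables @SpringBean injection into pages (SeedPage, SettingsPage, ...);
    // requires a Spring WebApplicationContext registered in the servlet context.
    getComponentInstantiationListeners().add(new SpringComponentInjector(this));
    // Bookmarkable mounts; SeedPage(PageParameters) reads the "id" page parameter.
    mountPage("/seed", SeedPage.class);
    mountPage("/settings", SettingsPage.class);
  }
}

With a mount like this, requests to /seed?id=1 or /settings would resolve to the pages added in this commit; the actual bootstrap class used by the Nutch web UI lies outside this excerpt.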
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java new file mode 100644 index 0000000..fba07ab --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java @@ -0,0 +1,153 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.pages.seed; + +import java.util.Iterator; + +import org.apache.nutch.webui.model.SeedList; +import org.apache.nutch.webui.model.SeedUrl; +import org.apache.nutch.webui.pages.AbstractBasePage; +import org.apache.nutch.webui.pages.components.CpmIteratorAdapter; +import org.apache.nutch.webui.service.SeedListService; +import org.apache.wicket.ajax.AjaxRequestTarget; +import org.apache.wicket.ajax.markup.html.AjaxLink; +import org.apache.wicket.ajax.markup.html.form.AjaxSubmitLink; +import org.apache.wicket.markup.html.WebMarkupContainer; +import org.apache.wicket.markup.html.basic.Label; +import org.apache.wicket.markup.html.form.Form; +import org.apache.wicket.markup.html.form.TextField; +import org.apache.wicket.markup.repeater.Item; +import org.apache.wicket.markup.repeater.RefreshingView; +import org.apache.wicket.model.CompoundPropertyModel; +import org.apache.wicket.model.IModel; +import org.apache.wicket.model.LoadableDetachableModel; +import org.apache.wicket.model.Model; +import org.apache.wicket.request.mapper.parameter.PageParameters; +import org.apache.wicket.spring.injection.annot.SpringBean; + +import com.google.common.collect.Lists; + +/** + * This page is for seed urls management + * + * @author feodor + * + */ +public class SeedPage extends AbstractBasePage<SeedList> { + + @SpringBean + private SeedListService seedListService; + + private Form<SeedUrl> urlForm; + + private WebMarkupContainer seedUrlsTable; + + public SeedPage() { + SeedList list = new SeedList(); + list.setSeedUrls(Lists.<SeedUrl> newArrayList()); + initPage(Model.of(list)); + } + + public SeedPage(final PageParameters parameters) { + initPage(new LoadableDetachableModel<SeedList>() { + + @Override + protected SeedList load() { + Long seedListId = parameters.get("id").toLongObject(); + return seedListService.getSeedList(seedListId); + } + }); + } + + public void initPage(IModel<SeedList> model) { + setModel(new CompoundPropertyModel<SeedList>(model)); + + addBaseForm(); + addSeedUrlsList(); + addUrlForm(); + } + + private void addBaseForm() { + Form<SeedList> form = new Form<SeedList>("seedList", getModel()) { + @Override + protected void onSubmit() { + 
seedListService.save(getModelObject()); + setResponsePage(SeedListsPage.class); + } + }; + form.add(new TextField<String>("name")); + add(form); + } + + private void addSeedUrlsList() { + seedUrlsTable = new WebMarkupContainer("seedUrlsTable"); + seedUrlsTable.setOutputMarkupId(true); + + RefreshingView<SeedUrl> seedUrls = new RefreshingView<SeedUrl>("seedUrls") { + + @Override + protected Iterator<IModel<SeedUrl>> getItemModels() { + return new CpmIteratorAdapter<SeedUrl>(getModelObject().getSeedUrls()); + } + + @Override + protected void populateItem(Item<SeedUrl> item) { + item.add(new Label("url")); + item.add(new AjaxLink<SeedUrl>("delete", item.getModel()) { + + @Override + public void onClick(AjaxRequestTarget target) { + deleteSeedUrl(getModelObject()); + target.add(seedUrlsTable); + } + }); + } + }; + seedUrlsTable.add(seedUrls); + add(seedUrlsTable); + } + + private void addUrlForm() { + urlForm = new Form<SeedUrl>("urlForm", CompoundPropertyModel.of(Model + .of(new SeedUrl()))); + urlForm.setOutputMarkupId(true); + urlForm.add(new TextField<String>("url")); + urlForm.add(new AjaxSubmitLink("addUrl", urlForm) { + @Override + protected void onSubmit(AjaxRequestTarget target, Form<?> form) { + addSeedUrl(); + urlForm.setModelObject(new SeedUrl()); + target.add(urlForm); + target.add(seedUrlsTable); + } + }); + add(urlForm); + } + + private void addSeedUrl() { + SeedUrl url = urlForm.getModelObject(); + SeedList seedList = getModelObject(); + url.setSeedList(seedList); + seedList.getSeedUrls().add(url); + } + + private void deleteSeedUrl(SeedUrl url) { + SeedList seedList = getModelObject(); + seedList.getSeedUrls().remove(url); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html new file mode 100644 index 0000000..8810371 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html @@ -0,0 +1,43 @@ +<!DOCTYPE html> +<html xmlns:wicket="http://wicket.apache.org"> +<head> +<meta charset="utf-8" /> +<title>Wicket extend</title> +</head> + +<body> + <wicket:extend> + <h2> + <wicket:message key="settings">Settings</wicket:message> + </h2> + <div class="row"> + <div class="col-lg-12"> + <table class="table table-hover tablesorter table-bordered"> + <thead> + <tr> + <th class="header col-lg-3"> + <wicket:message key="settings.header.name">Name</wicket:message> + </th> + <th class="header col-lg-9"> + <wicket:message key="settings.header.value">Value</wicket:message> + </th> + </tr> + </thead> + <tbody wicket:id="settingsTable"> + <tr wicket:id="settings"> + <td> + <span wicket:id="name">Name</span> + </td> + <td> +<!-- <span wicket:id="value">Value</span> --> + <input class="col-lg-12" wicket:id="value" placeholder="http://localhost:8080"> + </td> + </tr> + </tbody> + </table> + </div> + </div> + </wicket:extend> + +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java 
b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java new file mode 100644 index 0000000..29e46f7 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java @@ -0,0 +1,59 @@ +package org.apache.nutch.webui.pages.settings; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import org.apache.nutch.webui.model.NutchConfig; +import org.apache.nutch.webui.pages.AbstractBasePage; +import org.apache.nutch.webui.pages.components.CpmIteratorAdapter; +import org.apache.nutch.webui.service.NutchService; +import org.apache.wicket.markup.html.WebMarkupContainer; +import org.apache.wicket.markup.html.basic.Label; +import org.apache.wicket.markup.html.form.TextField; +import org.apache.wicket.markup.repeater.Item; +import org.apache.wicket.markup.repeater.RefreshingView; +import org.apache.wicket.model.IModel; +import org.apache.wicket.spring.injection.annot.SpringBean; + +public class SettingsPage extends AbstractBasePage<Void> { + @SpringBean + private NutchService nutchService; + + private WebMarkupContainer settingsTable; + + public SettingsPage() { + settingsTable = new WebMarkupContainer("settingsTable"); + settingsTable.setOutputMarkupId(true); + RefreshingView<NutchConfig> nutchConfig = new RefreshingView<NutchConfig>( + "settings") { + + @Override + protected Iterator<IModel<NutchConfig>> getItemModels() { + return new CpmIteratorAdapter<NutchConfig>( + convertNutchConfig(nutchService.getNutchConfig(getCurrentInstance() + .getId()))); + } + + @Override + protected void populateItem(Item<NutchConfig> item) { + item.add(new Label("name")); + item.add(new TextField<String>("value")); + } + }; + settingsTable.add(nutchConfig); + add(settingsTable); + } + + private List<NutchConfig> convertNutchConfig(Map<String, String> map) { + List<NutchConfig> listNutchConfigs = new LinkedList<NutchConfig>(); + for (String key : map.keySet()) { + NutchConfig conf = new NutchConfig(); + conf.setName(key); + conf.setValue(map.get(key)); + listNutchConfigs.add(conf); + } + return listNutchConfigs; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java new file mode 100644 index 0000000..c742b48 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.service; + +import java.util.List; + +import org.apache.nutch.webui.client.model.Crawl; +import org.apache.nutch.webui.model.NutchInstance; + +public interface CrawlService { + + public void saveCrawl(Crawl crawl); + + public List<Crawl> getCrawls(); + + void startCrawl(Long crawlId, NutchInstance instance); + + void deleteCrawl(Long crawlId); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java new file mode 100644 index 0000000..23f27e8 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.service; + +import java.util.List; + +import org.apache.nutch.webui.model.NutchInstance; + +public interface NutchInstanceService { + + public List<NutchInstance> getInstances(); + + public void saveInstance(NutchInstance instance); + + public void removeInstance(Long id); + + public NutchInstance getInstance(Long id); + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java new file mode 100644 index 0000000..643236a --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.service; + +import java.util.Map; + +import org.apache.nutch.webui.client.model.ConnectionStatus; +import org.apache.nutch.webui.client.model.NutchStatus; + +public interface NutchService { + public ConnectionStatus getConnectionStatus(Long instanceId); + + public Map<String, String> getNutchConfig(Long instanceId); + + public NutchStatus getNutchStatus(Long instanceId); + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java new file mode 100644 index 0000000..dda8c71 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.service; + +import java.util.List; + +import org.apache.nutch.webui.model.SeedList; + +public interface SeedListService { + + public void save(SeedList seedList); + + public void delete(Long seedListId); + + public List<SeedList> findAll(); + + public SeedList getSeedList(Long seedListId); + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java new file mode 100644 index 0000000..7bb133b --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.service.impl; + +import java.sql.SQLException; +import java.util.List; + +import javax.annotation.Resource; + +import org.apache.nutch.webui.client.NutchClient; +import org.apache.nutch.webui.client.NutchClientFactory; +import org.apache.nutch.webui.client.impl.CrawlingCycle; +import org.apache.nutch.webui.client.impl.RemoteCommandsBatchFactory; +import org.apache.nutch.webui.client.impl.CrawlingCycleListener; +import org.apache.nutch.webui.client.impl.RemoteCommand; +import org.apache.nutch.webui.client.impl.RemoteCommandExecutor; +import org.apache.nutch.webui.client.model.Crawl; +import org.apache.nutch.webui.client.model.Crawl.CrawlStatus; +import org.apache.nutch.webui.model.NutchInstance; +import org.apache.nutch.webui.service.CrawlService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +import com.j256.ormlite.dao.Dao; + +@Service +public class CrawlServiceImpl implements CrawlService, CrawlingCycleListener { + private Logger log = LoggerFactory.getLogger(CrawlServiceImpl.class); + + @Resource + private Dao<Crawl, Long> crawlDao; + + @Resource + private NutchClientFactory nutchClientFactory; + + @Resource + private RemoteCommandsBatchFactory commandFactory; + + @Override + @Async + public void startCrawl(Long crawlId, NutchInstance instance) { + Crawl crawl = null; + try { + crawl = crawlDao.queryForId(crawlId); + if(crawl.getCrawlId()==null) { + crawl.setCrawlId("crawl-" + crawlId.toString()); + } + NutchClient client = nutchClientFactory.getClient(instance); + String seedDirectory = client.createSeed(crawl.getSeedList()); + crawl.setSeedDirectory(seedDirectory); + + List<RemoteCommand> commands = commandFactory.createCommands(crawl); + RemoteCommandExecutor executor = new RemoteCommandExecutor(client); + + CrawlingCycle cycle = new CrawlingCycle(this, executor, crawl, commands); + cycle.executeCrawlCycle(); + + } catch (Exception e) { + crawl.setStatus(CrawlStatus.ERROR); + saveCrawl(crawl); + log.error("exception occured", e); + } + } + + @Override + public List<Crawl> getCrawls() { + try { + return crawlDao.queryForAll(); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @Override + public void saveCrawl(Crawl crawl) { + try { + crawlDao.createOrUpdate(crawl); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @Override + public void deleteCrawl(Long crawlId) { + try { + crawlDao.deleteById(crawlId); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @Override + public void crawlingStarted(Crawl crawl) { + crawl.setStatus(CrawlStatus.CRAWLING); + crawl.setProgress(0); + saveCrawl(crawl); + } + + @Override + public void onCrawlError(Crawl crawl, String msg) { + crawl.setStatus(CrawlStatus.ERROR); + saveCrawl(crawl); + } + + @Override + public void commandExecuted(Crawl crawl, RemoteCommand command, int progress) { + crawl.setProgress(progress); + saveCrawl(crawl); + } + + @Override + public void crawlingFinished(Crawl crawl) { + crawl.setStatus(CrawlStatus.FINISHED); + saveCrawl(crawl); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java 
b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java new file mode 100644 index 0000000..e100054 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.service.impl; + +import java.sql.SQLException; +import java.util.List; + +import javax.annotation.Resource; + +import org.apache.nutch.webui.client.NutchClientFactory; +import org.apache.nutch.webui.model.NutchInstance; +import org.apache.nutch.webui.service.NutchInstanceService; +import org.springframework.stereotype.Service; + +import com.j256.ormlite.dao.Dao; + +@Service +public class NutchInstanceServiceImpl implements NutchInstanceService { + + @Resource + private NutchClientFactory nutchClientFactory; + + @Resource + private Dao<NutchInstance, Long> instancesDao; + + @Override + public List<NutchInstance> getInstances() { + try { + return instancesDao.queryForAll(); + } catch (SQLException e) { + throw new RuntimeException(e); + } + + } + + @Override + public NutchInstance getInstance(Long id) { + try { + return instancesDao.queryForId(id); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @Override + public void saveInstance(NutchInstance instance) { + try { + instancesDao.createOrUpdate(instance); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @Override + public void removeInstance(Long id) { + try { + instancesDao.deleteById(id); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java new file mode 100644 index 0000000..db989cf --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.service.impl; + +import java.net.ConnectException; +import java.util.Collections; +import java.util.Map; + +import javax.annotation.Resource; + +import org.apache.nutch.webui.client.NutchClientFactory; +import org.apache.nutch.webui.client.model.ConnectionStatus; +import org.apache.nutch.webui.client.model.NutchStatus; +import org.apache.nutch.webui.model.NutchInstance; +import org.apache.nutch.webui.service.NutchInstanceService; +import org.apache.nutch.webui.service.NutchService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Service; + +import com.sun.jersey.api.client.ClientHandlerException; + +@Service +public class NutchServiceImpl implements NutchService { + private static final Logger logger = LoggerFactory + .getLogger(NutchServiceImpl.class); + + @Resource + private NutchClientFactory nutchClientFactory; + + @Resource + private NutchInstanceService instanceService; + + @Override + public ConnectionStatus getConnectionStatus(Long instanceId) { + NutchInstance instance = instanceService.getInstance(instanceId); + try { + NutchStatus nutchStatus = nutchClientFactory.getClient(instance) + .getNutchStatus(); + if (nutchStatus.getStartDate() != null) { + return ConnectionStatus.CONNECTED; + } + } catch (Exception e) { + if (e.getCause() instanceof ConnectException) { + return ConnectionStatus.DISCONNECTED; + } + + logger.error("Cannot connect to nutch server!", e); + } + return null; + } + + @Override + public Map<String, String> getNutchConfig(Long instanceId) { + NutchInstance instance = instanceService.getInstance(instanceId); + try { + return nutchClientFactory.getClient(instance).getNutchConfig("default"); + } catch (ClientHandlerException exception) { + return Collections.emptyMap(); + } + } + + @Override + public NutchStatus getNutchStatus(Long instanceId) { + NutchInstance instance = instanceService.getInstance(instanceId); + return nutchClientFactory.getClient(instance).getNutchStatus(); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java new file mode 100644 index 0000000..fced2d3 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.service.impl; + +import java.sql.SQLException; +import java.util.List; + +import javax.annotation.Resource; + +import org.apache.nutch.webui.model.SeedList; +import org.apache.nutch.webui.model.SeedUrl; +import org.apache.nutch.webui.service.SeedListService; +import org.springframework.stereotype.Service; + +import com.j256.ormlite.dao.Dao; + +@Service +public class SeedListServiceImpl implements SeedListService { + + @Resource + private Dao<SeedList, Long> seedListDao; + + @Resource + private Dao<SeedUrl, Long> seedUrlDao; + + @Override + public void save(SeedList seedList) { + try { + seedListDao.createOrUpdate(seedList); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @Override + public void delete(Long seedListId) { + try { + seedListDao.deleteById(seedListId); + } catch (SQLException e) { + throw new RuntimeException(e); + } + + } + + @Override + public List<SeedList> findAll() { + try { + return seedListDao.queryForAll(); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @Override + public SeedList getSeedList(Long seedListId) { + try { + return seedListDao.queryForId(seedListId); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/overview.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/overview.html b/nutch-core/src/main/java/overview.html new file mode 100644 index 0000000..1132141 --- /dev/null +++ b/nutch-core/src/main/java/overview.html @@ -0,0 +1,9 @@ +<html> +<head> + <title>Apache Nutch</title> +</head> +<body> +<p>Apache Nutch is a highly extensible and scalable open source web crawler software project.</p> +<p>Nutch is a project of the Apache Software Foundation and is part of the larger Apache community of developers and users.</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java new file mode 100644 index 0000000..bb938a6 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java @@ -0,0 +1,270 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.List; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.nutch.crawl.CrawlDbUpdateUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.TimingUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Emulate a continuous crawl for one URL. + * + */ +public class ContinuousCrawlTestUtil extends TestCase { + + private static final Logger LOG = LoggerFactory + .getLogger(ContinuousCrawlTestUtil.class); + + protected static Text dummyURL = new Text("http://nutch.apache.org/"); + + protected static Configuration defaultConfig = CrawlDBTestUtil + .createConfiguration(); + + protected long interval = FetchSchedule.SECONDS_PER_DAY * 1000; // (default) + // launch + // crawler + // every day + protected long duration = 2 * 365L * FetchSchedule.SECONDS_PER_DAY * 1000L; // run + // for + // two + // years + + protected Configuration configuration; + private FetchSchedule schedule; + + /** status a fetched datum should get */ + protected byte fetchStatus = CrawlDatum.STATUS_FETCH_SUCCESS; + /** expected status of the resulting Db datum */ + protected byte expectedDbStatus = CrawlDatum.STATUS_DB_FETCHED; + + /** for signature calculation */ + protected Signature signatureImpl; + protected Content content = new Content(); + + { + byte[] data = { 'n', 'u', 't', 'c', 'h' }; + content.setContent(data); + } + + protected ContinuousCrawlTestUtil(Configuration conf) { + configuration = conf; + schedule = FetchScheduleFactory.getFetchSchedule(new JobConf(conf)); + signatureImpl = SignatureFactory.getSignature(conf); + } + + protected ContinuousCrawlTestUtil(Configuration conf, byte fetchStatus, + byte expectedDbStatus) { + this(conf); + this.fetchStatus = fetchStatus; + this.expectedDbStatus = expectedDbStatus; + } + + protected ContinuousCrawlTestUtil() { + this(defaultConfig); + } + + protected ContinuousCrawlTestUtil(byte fetchStatus, byte expectedDbStatus) { + this(defaultConfig, fetchStatus, expectedDbStatus); + } + + /** set the interval the crawl is relaunched (default: every day) */ + protected void setInterval(int seconds) { + interval = seconds * 1000L; + } + + /** set the duration of the continuous crawl (default = 2 years) */ + protected void setDuraction(int seconds) { + duration = seconds * 1000L; + } + + /** + * default fetch action: set status and time + * + * @param datum + * CrawlDatum to fetch + * @param currentTime + * current time used to set the fetch time via + * {@link CrawlDatum#setFetchTime(long)} + * @return the modified CrawlDatum + */ + protected CrawlDatum fetch(CrawlDatum datum, long currentTime) { + datum.setStatus(fetchStatus); + datum.setFetchTime(currentTime); + return datum; + } + + /** + * get signature for content and configured signature implementation + */ + protected byte[] getSignature() { + return signatureImpl.calculate(content, null); + } + + /** + 
* change content to force a changed signature + */ + protected void changeContent() { + byte[] data = Arrays.copyOf(content.getContent(), + content.getContent().length + 1); + data[content.getContent().length] = '2'; // append one byte + content.setContent(data); + LOG.info("document content changed"); + } + + /** + * default parse action: add signature if successfully fetched + * + * @param fetchDatum + * fetch datum + * @return list of all datums resulting from parse (status: signature, linked, + * parse_metadata) + */ + protected List<CrawlDatum> parse(CrawlDatum fetchDatum) { + List<CrawlDatum> parseDatums = new ArrayList<CrawlDatum>(0); + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) { + CrawlDatum signatureDatum = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0); + signatureDatum.setSignature(getSignature()); + parseDatums.add(signatureDatum); + } + return parseDatums; + } + + /** + * default implementation to check the result state + * + * @param datum + * the CrawlDatum to be checked + * @return true if the check succeeds + */ + protected boolean check(CrawlDatum datum) { + if (datum.getStatus() != expectedDbStatus) + return false; + return true; + } + + /** + * Run the continuous crawl. + * <p> + * A loop emulates a continuous crawl launched in regular intervals (see + * {@link #setInterval(int)} over a longer period ({@link #setDuraction(int)}. + * + * <ul> + * <li>every "round" emulates + * <ul> + * <li>a fetch (see {@link #fetch(CrawlDatum, long)})</li> + * <li>{@literal updatedb} which returns a {@link CrawlDatum}</li> + * </ul> + * <li>the returned CrawlDatum is used as input for the next round</li> + * <li>and is checked whether it is correct (see {@link #check(CrawlDatum)}) + * </ul> + * </p> + * + * @param maxErrors + * (if > 0) continue crawl even if the checked CrawlDatum is not + * correct, but stop after max. number of errors + * + * @return false if a check of CrawlDatum failed, true otherwise + */ + protected boolean run(int maxErrors) { + + long now = System.currentTimeMillis(); + + CrawlDbUpdateUtil<CrawlDbReducer> updateDb = new CrawlDbUpdateUtil<CrawlDbReducer>( + new CrawlDbReducer(), configuration); + + /* start with a db_unfetched */ + CrawlDatum dbDatum = new CrawlDatum(); + dbDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); + schedule.initializeSchedule(dummyURL, dbDatum); // initialize fetchInterval + dbDatum.setFetchTime(now); + + LOG.info("Emulate a continuous crawl, launched every " + + (interval / (FetchSchedule.SECONDS_PER_DAY * 1000)) + " day (" + + (interval / 1000) + " seconds)"); + long maxTime = (now + duration); + long nextTime = now; + long lastFetchTime = -1; + boolean ok = true; // record failure but keep going + CrawlDatum fetchDatum = new CrawlDatum(); + /* + * Keep copies because CrawlDbReducer.reduce() and + * FetchSchedule.shouldFetch() may alter the references. Copies are used for + * verbose logging in case of an error. + */ + CrawlDatum copyDbDatum = new CrawlDatum(); + CrawlDatum copyFetchDatum = new CrawlDatum(); + CrawlDatum afterShouldFetch = new CrawlDatum(); + int errorCount = 0; + while (nextTime < maxTime) { + LOG.info("check: " + new Date(nextTime)); + fetchDatum.set(dbDatum); + copyDbDatum.set(dbDatum); + if (schedule.shouldFetch(dummyURL, fetchDatum, nextTime)) { + LOG.info("... 
fetching now (" + new Date(nextTime) + ")"); + if (lastFetchTime > -1) { + LOG.info("(last fetch: " + new Date(lastFetchTime) + " = " + + TimingUtil.elapsedTime(lastFetchTime, nextTime) + " ago)"); + } + lastFetchTime = nextTime; + afterShouldFetch.set(fetchDatum); + fetchDatum = fetch(fetchDatum, nextTime); + copyFetchDatum.set(fetchDatum); + List<CrawlDatum> values = new ArrayList<CrawlDatum>(); + values.add(dbDatum); + values.add(fetchDatum); + values.addAll(parse(fetchDatum)); + List<CrawlDatum> res = updateDb.update(values); + assertNotNull("null returned", res); + assertFalse("no CrawlDatum", 0 == res.size()); + assertEquals("more than one CrawlDatum", 1, res.size()); + if (!check(res.get(0))) { + LOG.info("previously in CrawlDb: " + copyDbDatum); + LOG.info("after shouldFetch(): " + afterShouldFetch); + LOG.info("fetch: " + fetchDatum); + LOG.warn("wrong result in CrawlDb: " + res.get(0)); + if (++errorCount >= maxErrors) { + if (maxErrors > 0) { + LOG.error("Max. number of errors " + maxErrors + + " reached. Stopping."); + } + return false; + } else { + ok = false; // record failure but keep going + } + } + /* use the returned CrawlDatum for the next fetch */ + dbDatum = res.get(0); + } + nextTime += interval; + } + return ok; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java new file mode 100644 index 0000000..56905e4 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.MapFile.Writer.Option; +import org.apache.hadoop.io.Text; +import org.mortbay.jetty.Server; +import org.mortbay.jetty.bio.SocketConnector; +import org.mortbay.jetty.handler.ContextHandler; +import org.mortbay.jetty.handler.ResourceHandler; + +public class CrawlDBTestUtil { + + private static final Logger LOG = LoggerFactory + .getLogger(CrawlDBTestUtil.class); + + /** + * Creates synthetic crawldb + * + * @param fs + * filesystem where db will be created + * @param crawldb + * path were db will be created + * @param init + * urls to be inserted, objects are of type URLCrawlDatum + * @throws Exception + */ + public static void createCrawlDb(Configuration conf, FileSystem fs, + Path crawldb, List<URLCrawlDatum> init) throws Exception { + LOG.trace("* creating crawldb: " + crawldb); + Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME); + Option wKeyOpt = MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class); + MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir, + "part-r-00000"), wKeyOpt, wValueOpt); + Iterator<URLCrawlDatum> it = init.iterator(); + while (it.hasNext()) { + URLCrawlDatum row = it.next(); + LOG.info("adding:" + row.url.toString()); + writer.append(new Text(row.url), row.datum); + } + writer.close(); + } + + /** + * For now we need to manually construct our Configuration, because we need to + * override the default one and it is currently not possible to use + * dynamically set values. + * + * @return + * @deprecated Use {@link #createConfiguration()} instead + */ + @Deprecated + public static Configuration create() { + return createConfiguration(); + } + + /** + * For now we need to manually construct our Configuration, because we need to + * override the default one and it is currently not possible to use + * dynamically set values. 
+ * + * @return + */ + public static Configuration createConfiguration() { + Configuration conf = new Configuration(); + conf.addResource("nutch-default.xml"); + conf.addResource("crawl-tests.xml"); + return conf; + } + + public static class URLCrawlDatum { + + public Text url; + + public CrawlDatum datum; + + public URLCrawlDatum(Text url, CrawlDatum datum) { + this.url = url; + this.datum = datum; + } + } + + /** + * Generate seedlist + * + * @throws IOException + */ + public static void generateSeedList(FileSystem fs, Path urlPath, + List<String> urls) throws IOException { + generateSeedList(fs, urlPath, urls, new ArrayList<String>()); + } + + /** + * Generate seedlist + * + * @throws IOException + */ + public static void generateSeedList(FileSystem fs, Path urlPath, + List<String> urls, List<String> metadata) throws IOException { + FSDataOutputStream out; + Path file = new Path(urlPath, "urls.txt"); + fs.mkdirs(urlPath); + out = fs.create(file); + + Iterator<String> urls_i = urls.iterator(); + Iterator<String> metadata_i = metadata.iterator(); + + String url; + String md; + while (urls_i.hasNext()) { + url = urls_i.next(); + + out.writeBytes(url); + + if (metadata_i.hasNext()) { + md = metadata_i.next(); + out.writeBytes(md); + } + + out.writeBytes("\n"); + } + + out.flush(); + out.close(); + } + + /** + * Creates a new JettyServer with one static root context + * + * @param port + * port to listen to + * @param staticContent + * folder where static content lives + * @throws UnknownHostException + */ + public static Server getServer(int port, String staticContent) + throws UnknownHostException { + Server webServer = new org.mortbay.jetty.Server(); + SocketConnector listener = new SocketConnector(); + listener.setPort(port); + listener.setHost("127.0.0.1"); + webServer.addConnector(listener); + ContextHandler staticContext = new ContextHandler(); + staticContext.setContextPath("/"); + staticContext.setResourceBase(staticContent); + staticContext.addHandler(new ResourceHandler()); + webServer.addHandler(staticContext); + return webServer; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java new file mode 100644 index 0000000..7238f88 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configuration.IntegerRanges; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.RawComparator; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; +import org.apache.hadoop.mapreduce.Counters; +import org.apache.hadoop.mapreduce.InputFormat; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.JobID; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.OutputCommitter; +import org.apache.hadoop.mapreduce.OutputFormat; +import org.apache.hadoop.mapreduce.Partitioner; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.Reducer.Context; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.mapreduce.TaskInputOutputContext; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.util.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.mrunit.mapreduce.ReduceDriver; +import org.apache.hadoop.mrunit.types.Pair; + +/** + * Utility to test transitions of {@link CrawlDatum} states during an update of + * {@link CrawlDb} (command {@literal updatedb}): call + * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} + * (using MRUnit) with the old CrawlDatum (db status) and the new one (fetch + * status) + */ +public class CrawlDbUpdateTestDriver<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> { + + private static final Logger LOG = LoggerFactory + .getLogger(CrawlDbUpdateTestDriver.class); + + private ReduceDriver<Text, CrawlDatum, Text, CrawlDatum> reduceDriver; + private T reducer; + private Configuration configuration; + + public static Text dummyURL = new Text("http://nutch.apache.org/"); + +// protected CrawlDbUpdateUtilNewAPI(T red, T.Context con) { + protected CrawlDbUpdateTestDriver(T updateReducer, Configuration conf) { + reducer = updateReducer; + configuration = conf; + } + + /** + * run + * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} + * and return the CrawlDatum(s) which would have been written into CrawlDb + * + * @param values + * list of input CrawlDatums + * @return list of resulting CrawlDatum(s) in CrawlDb + */ + public List<CrawlDatum> update(List<CrawlDatum> values) { + List<CrawlDatum> result = new ArrayList<CrawlDatum>(0); + if (values == null || values.size() == 0) { + return result; + } + Collections.shuffle(values); // sorting of values should have no influence + reduceDriver = ReduceDriver.newReduceDriver(reducer); + reduceDriver.setConfiguration(configuration); + reduceDriver.withInput(dummyURL, values); + List<Pair<Text,CrawlDatum>> reduceResult; + try { + reduceResult = reduceDriver.run(); + for (Pair<Text,CrawlDatum> p : reduceResult) { + if (p.getFirst().equals(dummyURL)) { + result.add(p.getSecond()); + } + } + } catch (IOException e) { + LOG.error(StringUtils.stringifyException(e)); + return result; + } + return result; + } + + /** + * run + * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} + * and return the CrawlDatum(s) which would have been written into CrawlDb + * + * @param dbDatum + * previous CrawlDatum in CrawlDb + * 
@param fetchDatum + * CrawlDatum resulting from fetching + * @return list of resulting CrawlDatum(s) in CrawlDb + */ + public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) { + List<CrawlDatum> values = new ArrayList<CrawlDatum>(); + if (dbDatum != null) + values.add(dbDatum); + if (fetchDatum != null) + values.add(fetchDatum); + return update(values); + } + + /** + * see {@link #update(List)} + */ + public List<CrawlDatum> update(CrawlDatum... values) { + return update(Arrays.asList(values)); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java new file mode 100644 index 0000000..bfb716d --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java @@ -0,0 +1,166 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.Counters.Counter; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.util.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Utility to test transitions of {@link CrawlDatum} states during an update of + * {@link CrawlDb} (command {@literal updatedb}): call + * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} with + * the old CrawlDatum (db status) and the new one (fetch status) + */ +public class CrawlDbUpdateUtil<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> { + + private static final Logger LOG = LoggerFactory + .getLogger(CrawlDbUpdateUtil.class); + + private T reducer; + + public static Text dummyURL = new Text("http://nutch.apache.org/"); + + protected CrawlDbUpdateUtil(T red, Configuration conf) { + reducer = red; + reducer.configure(new JobConf(conf)); + } + + /** {@link OutputCollector} to collect all values in a {@link List} */ + private class ListOutputCollector implements + OutputCollector<Text, CrawlDatum> { + + private List<CrawlDatum> values = new ArrayList<CrawlDatum>(); + + public void collect(Text key, CrawlDatum value) throws IOException { + values.add(value); + } + + /** collected values as list */ + public List<CrawlDatum> getValues() { + return values; + } + + } + + /** + * Dummy reporter which does nothing and does not return null for getCounter() + * + * @see {@link Reporter#NULL} + */ + private class DummyReporter implements Reporter { + + private Counters dummyCounters = new Counters(); + + public void progress() { + } + + public Counter getCounter(Enum<?> arg0) { + return dummyCounters.getGroup("dummy").getCounterForName("dummy"); + } + + public Counter getCounter(String arg0, String arg1) { + return dummyCounters.getGroup("dummy").getCounterForName("dummy"); + } + + public InputSplit getInputSplit() throws UnsupportedOperationException { + throw new UnsupportedOperationException("Dummy reporter without input"); + } + + public void incrCounter(Enum<?> arg0, long arg1) { + } + + public void incrCounter(String arg0, String arg1, long arg2) { + } + + public void setStatus(String arg0) { + } + + public float getProgress() { + return 1f; + } + + } + + /** + * run + * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} + * and return the CrawlDatum(s) which would have been written into CrawlDb + * + * @param values + * list of input CrawlDatums + * @return list of resulting CrawlDatum(s) in CrawlDb + */ + public List<CrawlDatum> update(List<CrawlDatum> values) { + if (values == null || values.size() == 0) { + return new ArrayList<CrawlDatum>(0); + } + Collections.shuffle(values); // sorting of values should have no influence + ListOutputCollector output = new ListOutputCollector(); + try { + reducer.reduce(dummyURL, values.iterator(), output, new DummyReporter()); + } catch (IOException e) { + LOG.error(StringUtils.stringifyException(e)); + } + return output.getValues(); + } + + /** + * run + * {@link CrawlDbReducer#reduce(Text, 
Iterator, OutputCollector, Reporter)} + * and return the CrawlDatum(s) which would have been written into CrawlDb + * + * @param dbDatum + * previous CrawlDatum in CrawlDb + * @param fetchDatum + * CrawlDatum resulting from fetching + * @return list of resulting CrawlDatum(s) in CrawlDb + */ + public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) { + List<CrawlDatum> values = new ArrayList<CrawlDatum>(); + if (dbDatum != null) + values.add(dbDatum); + if (fetchDatum != null) + values.add(fetchDatum); + return update(values); + } + + /** + * see {@link #update(List)} + */ + public List<CrawlDatum> update(CrawlDatum... values) { + return update(Arrays.asList(values)); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java b/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java new file mode 100644 index 0000000..94c27b5 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import org.apache.hadoop.io.IntWritable; + +public class DummyWritable extends IntWritable { + + public DummyWritable() { + + } + + public DummyWritable(int i) { + super(i); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java new file mode 100644 index 0000000..fd88c7d --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java @@ -0,0 +1,171 @@ +package org.apache.nutch.crawl; + +import static org.apache.nutch.crawl.CrawlDatum.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.TimingUtil; + +import static org.junit.Assert.*; + +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Category({ IntegrationTest.class}) +public class TODOTestCrawlDbStates extends TestCrawlDbStates { + + private static final Logger LOG = LoggerFactory + .getLogger(TODOTestCrawlDbStates.class); + + /** + * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max + * is reached. Retry counter has to be reset appropriately. 
+ */ + @Test + public void testCrawlDbReducerPageRetrySchedule() { + LOG.info("NUTCH-578: test long running continuous crawl with fetch_retry"); + ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestFetchRetry(); + // keep going for long, to "provoke" a retry counter overflow + if (!crawlUtil.run(150)) { + fail("fetch_retry did not result in a db_gone if retry counter > maxRetries (NUTCH-578)"); + } + } + + private class ContinuousCrawlTestFetchRetry extends ContinuousCrawlTestUtil { + + private int retryMax = 3; + private int totalRetries = 0; + + ContinuousCrawlTestFetchRetry() { + super(); + fetchStatus = STATUS_FETCH_RETRY; + retryMax = configuration.getInt("db.fetch.retry.max", retryMax); + } + + @Override + protected CrawlDatum fetch(CrawlDatum datum, long currentTime) { + datum.setStatus(fetchStatus); + datum.setFetchTime(currentTime); + totalRetries++; + return datum; + } + + @Override + protected boolean check(CrawlDatum result) { + if (result.getRetriesSinceFetch() > retryMax) { + LOG.warn("Retry counter > db.fetch.retry.max: " + result); + } else if (result.getRetriesSinceFetch() == Byte.MAX_VALUE) { + LOG.warn("Retry counter max. value reached (overflow imminent): " + + result); + } else if (result.getRetriesSinceFetch() < 0) { + LOG.error("Retry counter overflow: " + result); + return false; + } + // use retry counter bound to this class (totalRetries) + // instead of result.getRetriesSinceFetch() because the retry counter + // in CrawlDatum could be reset (eg. NUTCH-578_v5.patch) + if (totalRetries < retryMax) { + if (result.getStatus() == STATUS_DB_UNFETCHED) { + LOG.info("ok: " + result); + result.getRetriesSinceFetch(); + return true; + } + } else { + if (result.getStatus() == STATUS_DB_GONE) { + LOG.info("ok: " + result); + return true; + } + } + LOG.warn("wrong: " + result); + return false; + } + + } + + /** + * NUTCH-1564 AdaptiveFetchSchedule: sync_delta forces immediate re-fetch for + * documents not modified + * <p> + * Problem: documents not modified for a longer time are fetched in every + * cycle because of an error in the SYNC_DELTA calculation of + * {@link AdaptiveFetchSchedule}. <br> + * The next fetch time should always be in the future, never in the past. 
+ * </p> + */ + @Test + public void testAdaptiveFetchScheduleSyncDelta() { + LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule"); + Configuration conf = CrawlDBTestUtil.createConfiguration(); + conf.setLong("db.fetch.interval.default", 172800); // 2 days + conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day + conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days + conf.setLong("db.fetch.interval.max", 604800); // 7 days + conf.set("db.fetch.schedule.class", + "org.apache.nutch.crawl.AdaptiveFetchSchedule"); + ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime( + conf); + crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3); + if (!crawlUtil.run(100)) { + fail("failed: sync_delta calculation with AdaptiveFetchSchedule"); + } + } + + private class CrawlTestFetchScheduleNotModifiedFetchTime extends + CrawlTestFetchNotModified { + + // time of current fetch + private long fetchTime; + + private long minInterval; + private long maxInterval; + + CrawlTestFetchScheduleNotModifiedFetchTime(Configuration conf) { + super(conf); + minInterval = conf.getLong("db.fetch.schedule.adaptive.min_interval", + 86400); // 1 day + maxInterval = conf.getLong("db.fetch.schedule.adaptive.max_interval", + 604800); // 7 days + if (conf.getLong("db.fetch.interval.max", 604800) < maxInterval) { + maxInterval = conf.getLong("db.fetch.interval.max", 604800); + } + } + + @Override + protected CrawlDatum fetch(CrawlDatum datum, long currentTime) { + // remember time of fetching + fetchTime = currentTime; + return super.fetch(datum, currentTime); + } + + @Override + protected boolean check(CrawlDatum result) { + if (result.getStatus() == STATUS_DB_NOTMODIFIED) { + // check only status notmodified here + long secondsUntilNextFetch = (result.getFetchTime() - fetchTime) / 1000L; + if (secondsUntilNextFetch < -1) { + // next fetch time is in the past (more than one second) + LOG.error("Next fetch time is in the past: " + result); + return false; + } + if (secondsUntilNextFetch < 60) { + // next fetch time is in less than one minute + // (critical: Nutch can hardly be so fast) + LOG.error("Less then one minute until next fetch: " + result); + } + // Next fetch time should be within min. and max. (tolerance: 60 sec.) + if (secondsUntilNextFetch + 60 < minInterval + || secondsUntilNextFetch - 60 > maxInterval) { + LOG.error("Interval until next fetch time (" + + TimingUtil.elapsedTime(fetchTime, result.getFetchTime()) + + ") is not within min. and max. interval: " + result); + // TODO: is this a failure? + } + } + return true; + } + + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java new file mode 100644 index 0000000..3fa798d --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.crawl; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for AdaptiveFetchSchedule. + * + */ +public class TestAdaptiveFetchSchedule extends TestCase { + + private float inc_rate; + private float dec_rate; + private Configuration conf; + private long curTime, lastModified; + private int changed, interval, calculateInterval; + + @Before + public void setUp() throws Exception { + super.setUp(); + conf = NutchConfiguration.create(); + inc_rate = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); + dec_rate = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f); + interval = 100; + lastModified = 0; + } + + /** + * Test the core functionality of AdaptiveFetchSchedule. + * + */ + + @Test + public void testAdaptiveFetchSchedule() { + + FetchSchedule fs = new AdaptiveFetchSchedule(); + fs.setConf(conf); + + CrawlDatum p = prepareCrawlDatum(); + Text url = new Text("http://www.example.com"); + + changed = FetchSchedule.STATUS_UNKNOWN; + fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime, + lastModified, changed); + validateFetchInterval(changed, p.getFetchInterval()); + + changed = FetchSchedule.STATUS_MODIFIED; + fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime, + lastModified, changed); + validateFetchInterval(changed, p.getFetchInterval()); + p.setFetchInterval(interval); + + changed = FetchSchedule.STATUS_NOTMODIFIED; + fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime, + lastModified, changed); + validateFetchInterval(changed, p.getFetchInterval()); + + } + + /** + * Prepare a CrawlDatum (STATUS_DB_UNFETCHED) to Test AdaptiveFetchSchedule. + * + * @return properly initialized CrawlDatum + */ + public CrawlDatum prepareCrawlDatum() { + CrawlDatum p = new CrawlDatum(); + p.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); + p.setFetchInterval(interval); + p.setScore(1.0f); + p.setFetchTime(0); + return p; + } + + /** + * + * The Method validates interval values according to changed parameter. + * + * @param changed + * status value to check calculated interval value. + * @param getInterval + * to test IntervalValue from CrawlDatum which is calculated via + * AdaptiveFetchSchedule algorithm. 
+ */ + private void validateFetchInterval(int changed, int getInterval) { + + if (changed == FetchSchedule.STATUS_UNKNOWN) { + assertEquals(getInterval, interval); + + } else if (changed == FetchSchedule.STATUS_MODIFIED) { + calculateInterval = (int) (interval - (interval * dec_rate)); + assertEquals(getInterval, calculateInterval); + + } else if (changed == FetchSchedule.STATUS_NOTMODIFIED) { + calculateInterval = (int) (interval + (interval * inc_rate)); + assertEquals(getInterval, calculateInterval); + } + + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java new file mode 100644 index 0000000..773dd29 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java @@ -0,0 +1,148 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.util.ArrayList; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.SequenceFile.Reader.Option; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.*; +import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchJob; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * CrawlDbFiltering test which tests for correct, error free url normalization + * when the CrawlDB includes urls with <code>DB GONE</code> status and + * <code>CRAWLDB_PURGE_404</code> is set to true. 
+ * + * @author lufeng + */ +public class TestCrawlDbFilter { + Configuration conf; + Path dbDir; + Path newCrawlDb; + final static Path testdir = new Path("build/test/crawldbfilter-test"); + FileSystem fs; + + @Before + public void setUp() throws Exception { + conf = CrawlDBTestUtil.createConfiguration(); + fs = FileSystem.get(conf); + fs.delete(testdir, true); + } + + @After + public void tearDown() { + delete(testdir); + } + + private void delete(Path p) { + try { + fs.delete(p, true); + } catch (IOException e) { + } + } + + /** + * Test url404Purging + * + * @throws Exception + */ + @Test + @Category({IntegrationTest.class}) + public void testUrl404Purging() throws Exception { + // create a CrawlDatum with DB GONE status + ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); + list.add(new URLCrawlDatum(new Text("http://www.example.com"), + new CrawlDatum(CrawlDatum.STATUS_DB_GONE, 0, 0.0f))); + list.add(new URLCrawlDatum(new Text("http://www.example1.com"), + new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f))); + list.add(new URLCrawlDatum(new Text("http://www.example2.com"), + new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f))); + dbDir = new Path(testdir, "crawldb"); + newCrawlDb = new Path(testdir, "newcrawldb"); + // create crawldb + CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list); + // set CRAWLDB_PURGE_404 to true + conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404, true); + conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true); + conf.setBoolean(CrawlDbFilter.URL_FILTERING, false); + conf.setInt("urlnormalizer.loop.count", 2); + JobConf job = new NutchJob(conf); + job.setJobName("Test CrawlDbFilter"); + Path current = new Path(dbDir, "current"); + if (FileSystem.get(job).exists(current)) { + FileInputFormat.addInputPath(job, current); + } + job.setInputFormat(SequenceFileInputFormat.class); + job.setMapperClass(CrawlDbFilter.class); + job.setReducerClass(CrawlDbReducer.class); + FileOutputFormat.setOutputPath(job, newCrawlDb); + job.setOutputFormat(MapFileOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(CrawlDatum.class); + JobClient.runJob(job); + + Path fetchlist = new Path(new Path(newCrawlDb, "part-00000"), "data"); + + ArrayList<URLCrawlDatum> l = readContents(fetchlist); + + // verify we got right amount of records + Assert.assertEquals(2, l.size()); + } + + /** + * Read contents of fetchlist. 
+ * + * @param fetchlist + * path to Generated fetchlist + * @return Generated {@link URLCrawlDatum} objects + * @throws IOException + */ + private ArrayList<URLCrawlDatum> readContents(Path fetchlist) + throws IOException { + // verify results + Option fFile = SequenceFile.Reader.file(fetchlist); + SequenceFile.Reader reader = new SequenceFile.Reader(conf, fFile); + + ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>(); + + READ: do { + Text key = new Text(); + CrawlDatum value = new CrawlDatum(); + if (!reader.next(key, value)) { + break READ; + } + l.add(new URLCrawlDatum(key, value)); + } while (true); + + reader.close(); + return l; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java new file mode 100644 index 0000000..599c353 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java @@ -0,0 +1,163 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.crawl; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.TreeSet; +import java.util.logging.Logger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.MapFile.Writer.Option; +import org.apache.hadoop.mapred.JobConf; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +public class TestCrawlDbMerger { + private static final Logger LOG = Logger.getLogger(CrawlDbMerger.class + .getName()); + + String url10 = "http://example.com/"; + String url11 = "http://example.com/foo"; + String url20 = "http://example.com/"; + String url21 = "http://example.com/bar"; + String[] urls_expected = new String[] { url10, url11, url21 }; + + TreeSet<String> init1 = new TreeSet<String>(); + TreeSet<String> init2 = new TreeSet<String>(); + HashMap<String, CrawlDatum> expected = new HashMap<String, CrawlDatum>(); + CrawlDatum cd1, cd2, cd3; + Configuration conf; + FileSystem fs; + Path testDir; + CrawlDbReader reader; + + @Before + public void setUp() throws Exception { + init1.add(url10); + init1.add(url11); + init2.add(url20); + init2.add(url21); + long time = System.currentTimeMillis(); + cd1 = new CrawlDatum(); + cd1.setFetchInterval(1.0f); + cd1.setFetchTime(time); + cd1.getMetaData().put(new Text("name"), new Text("cd1")); + cd1.getMetaData().put(new Text("cd1"), new Text("cd1")); + cd2 = new CrawlDatum(); + cd2.setFetchInterval(1.0f); + cd2.setFetchTime(time + 10000); + cd2.getMetaData().put(new Text("name"), new Text("cd2")); + cd3 = new CrawlDatum(); + cd3.setFetchInterval(1.0f); + cd3.setFetchTime(time + 10000); + cd3.getMetaData().putAll(cd1.getMetaData()); + cd3.getMetaData().putAll(cd2.getMetaData()); + expected.put(url10, cd3); + expected.put(url11, cd1); + expected.put(url21, cd2); + conf = NutchConfiguration.create(); + fs = FileSystem.get(conf); + testDir = new Path("test-crawldb-" + new java.util.Random().nextInt()); + fs.mkdirs(testDir); + } + + @After + public void tearDown() { + try { + if (fs.exists(testDir)) + fs.delete(testDir, true); + } catch (Exception e) { + } + try { + reader.close(); + } catch (Exception e) { + } + } + + /** + * Test creates two sample {@link org.apache.nutch.crawl.CrawlDb}'s + * populating entries for keys as {@link org.apache.hadoop.io.Text} e.g. URLs + * and values as {@link org.apache.nutch.crawl.CrawlDatum} e.g. record data. + * It then simulates a merge process for the two CrawlDb's via the {@link org.apache.nutch.crawl.CrawlDbMerger} + * tool. The merged CrawlDb is then written to an arbitrary output location and the results + * read using the {@link org.apache.nutch.crawl.CrawlDbReader} tool. + * Test assertions include comparing expected CrawlDb key, value (URL, CrawlDatum) values + * with actual results based on the merge process. 
+ * @throws Exception + */ + @Test + @Category({IntegrationTest.class}) + public void testMerge() throws Exception { + Path crawldb1 = new Path(testDir, "crawldb1"); + Path crawldb2 = new Path(testDir, "crawldb2"); + Path output = new Path(testDir, "output"); + createCrawlDb(conf, fs, crawldb1, init1, cd1); + createCrawlDb(conf, fs, crawldb2, init2, cd2); + CrawlDbMerger merger = new CrawlDbMerger(conf); + LOG.fine("* merging crawldbs to " + output); + merger.merge(output, new Path[] { crawldb1, crawldb2 }, false, false); + LOG.fine("* reading crawldb: " + output); + reader = new CrawlDbReader(); + String crawlDb = output.toString(); + Iterator<String> it = expected.keySet().iterator(); + while (it.hasNext()) { + String url = it.next(); + LOG.fine("url=" + url); + CrawlDatum cd = expected.get(url); + CrawlDatum res = reader.get(crawlDb, url, new JobConf(conf)); + LOG.fine(" -> " + res); + System.out.println("url=" + url); + System.out.println(" cd " + cd); + System.out.println(" res " + res); + // may not be null + Assert.assertNotNull(res); + Assert.assertTrue(cd.equals(res)); + } + reader.close(); + fs.delete(testDir, true); + } + + private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb, + TreeSet<String> init, CrawlDatum cd) throws Exception { + LOG.fine("* creating crawldb: " + crawldb); + Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME); + + Option wKeyOpt = MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class); + + MapFile.Writer writer = new MapFile.Writer(config, new Path(dir, + "part-r-00000"), wKeyOpt, wValueOpt); + Iterator<String> it = init.iterator(); + while (it.hasNext()) { + String key = it.next(); + writer.append(new Text(key), cd); + } + writer.close(); + } +}
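For readers following the CrawlDbUpdateUtil harness introduced above, here is a minimal sketch of how a state-transition check can be written against it. The class name ExampleCrawlDbStateTransitionTest and the expected outcome (db_unfetched plus fetch_success collapsing to a single db_fetched entry under the default schedule) are illustrative assumptions and are not part of this commit; the helper calls (CrawlDBTestUtil.createConfiguration(), the CrawlDatum constants and three-argument constructor) are the same ones used by the tests above.

package org.apache.nutch.crawl;

import static org.junit.Assert.assertEquals;

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.junit.Test;

public class ExampleCrawlDbStateTransitionTest {

  @Test
  public void unfetchedPlusFetchSuccessBecomesFetched() {
    // Configuration helper used by the other CrawlDb tests in this commit.
    Configuration conf = CrawlDBTestUtil.createConfiguration();

    // CrawlDbUpdateUtil wraps CrawlDbReducer.reduce() and returns the
    // CrawlDatum(s) that would have been written to CrawlDb.
    CrawlDbUpdateUtil<CrawlDbReducer> util =
        new CrawlDbUpdateUtil<CrawlDbReducer>(new CrawlDbReducer(), conf);

    CrawlDatum dbDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f);
    CrawlDatum fetchDatum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 0, 0.0f);

    List<CrawlDatum> result = util.update(dbDatum, fetchDatum);

    // Assumption: with the default schedule, db_unfetched + fetch_success
    // is reduced to exactly one db_fetched entry in CrawlDb.
    assertEquals(1, result.size());
    assertEquals(CrawlDatum.STATUS_DB_FETCHED, result.get(0).getStatus());
  }
}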

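The interval expectations asserted in TestAdaptiveFetchSchedule.validateFetchInterval() reduce to simple arithmetic on the configured rates. A standalone sketch with the default rates (0.2) and the test's interval of 100 seconds follows; the values are taken from the test above, while the class and main method exist only for illustration.

public class AdaptiveIntervalArithmeticSketch {
  public static void main(String[] args) {
    float incRate = 0.2f;  // db.fetch.schedule.adaptive.inc_rate default read by the test
    float decRate = 0.2f;  // db.fetch.schedule.adaptive.dec_rate default read by the test
    int interval = 100;    // fetch interval set up in TestAdaptiveFetchSchedule

    // STATUS_UNKNOWN: the schedule leaves the interval untouched.
    int unknown = interval;                                   // 100
    // STATUS_MODIFIED: interval shrinks by dec_rate.
    int modified = (int) (interval - interval * decRate);     // 80
    // STATUS_NOTMODIFIED: interval grows by inc_rate.
    int notModified = (int) (interval + interval * incRate);  // 120

    System.out.println(unknown + " " + modified + " " + notModified);
  }
}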