[ https://issues.apache.org/jira/browse/NUTCH-2856?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17889297#comment-17889297 ]
ASF GitHub Bot commented on NUTCH-2856: --------------------------------------- lewismc commented on code in PR #826: URL: https://github.com/apache/nutch/pull/826#discussion_r1799918553 ########## src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java: ########## @@ -0,0 +1,292 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.smb; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.protocol.RobotRulesParser; +import com.hierynomus.msdtyp.AccessMask; +import com.hierynomus.msfscc.FileAttributes; +import com.hierynomus.msfscc.fileinformation.FileAllInformation; +import com.hierynomus.msfscc.fileinformation.FileIdBothDirectoryInformation; +import com.hierynomus.mssmb2.SMB2CreateDisposition; +import com.hierynomus.mssmb2.SMB2CreateOptions; +import com.hierynomus.mssmb2.SMB2ShareAccess; +import com.hierynomus.smbj.auth.AuthenticationContext; +import com.hierynomus.smbj.connection.Connection; +import com.hierynomus.smbj.session.Session; +import com.hierynomus.smbj.share.DiskShare; +import com.hierynomus.smbj.share.File; +import com.hierynomus.smbj.SMBClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import crawlercommons.robots.BaseRobotRules; + +public class Smb implements Protocol { + protected static final Logger LOG = LoggerFactory.getLogger(Smb.class); + + private Configuration conf; + + private String user; + private String password; + private String domain; + private int contentLimit; + private Set<String> ignoreFiles; + + public 
Smb() { + // todo: files that should be skipped could be configurable. + this.ignoreFiles = new HashSet<>(); + ignoreFiles.add("."); + ignoreFiles.add(".."); + ignoreFiles.add(".svn"); + ignoreFiles.add(".git"); + } + + @Override + public Configuration getConf() { + LOG.debug("getConf()"); + return this.conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + + // todo: is it possible to use configuration "per server" or "per share"? + user = conf.getTrimmed("smb.user"); + if (user == null || user.isEmpty()) { + throw new IllegalArgumentException("Config parameter 'smb.user' not set."); + } + password = conf.getTrimmed("smb.password"); + if (password == null || password.isEmpty()) { + throw new IllegalArgumentException("Config parameter 'smb.password' not set."); + } + domain = conf.getTrimmed("smb.domain"); + contentLimit = conf.getInt("smb.content.limit", Integer.MAX_VALUE); + } + + /** + * list directory. + * + * @return some HTML string + */ + private String getDirectoryContent(DiskShare share, String shareName, String path) throws UnsupportedEncodingException { + StringBuffer sb = new StringBuffer(); + sb.append("<html><head>"); + sb.append("<title>Index of ").append("/").append(shareName).append(path).append("</title>"); + sb.append("</head><body>"); + sb.append("<h1>Index of ").append("/").append(shareName).append(path).append("</h1>"); + sb.append("<pre>"); + for (FileIdBothDirectoryInformation f : share.list(path)) { + if (ignoreFiles.contains(f.getFileName())) { + LOG.warn("File skipped: " + f.getFileName()); + continue; + } + boolean isDir = share.folderExists(path + "/" + f.getFileName()); + + sb.append("<a href=\"").append(java.net.URLEncoder.encode(f.getFileName(), StandardCharsets.UTF_8.name())); + if (isDir) { + sb.append("/"); + } + sb.append("\">").append(f.getFileName()); + if (isDir) { + sb.append("/"); + } + sb.append("\t").append(f.getLastWriteTime()).append("</a>\n"); + } + sb.append("</pre>"); + 
sb.append("</body></html>"); + + return sb.toString(); + } + + private static final char[] HEX_ARRAY = "0123456789ABCDEF".toCharArray(); + + /** + * Get the {@link ProtocolOutput} for a given url and crawldatum. + * + * @param url canonical url + * @param datum associated {@link org.apache.nutch.crawl.CrawlDatum} + * @return the {@link ProtocolOutput} + * @see https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/CrawlDatum.java + */ + @Override + public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) { + LOG.warn("getProtocolOutput({}, {})", urlstr, datum); + + + + try { + String u = java.net.URLDecoder.decode(urlstr.toString(), StandardCharsets.UTF_8.name()); + u = u.split("://")[1]; + LOG.warn("u={}", u); + String[] components = u.split("[:/]", 2); + String hostname = components[0]; + String shareAndPath = components[1]; + LOG.warn("hostname={}", hostname); + LOG.warn("shareAndPath={}", shareAndPath); + components = shareAndPath.split("/", 2); + String shareName = components[0]; + String path = components.length>1 ? "/" + components[1]: "/"; + LOG.warn("share={}", shareName); + LOG.warn("path={}", path); + + // todo: we construct and destruct the connection for each and every URL. Can connection pools improve? + SMBClient client = new SMBClient(); Review Comment: Yes I think we need to experiment here a bit. > Implement a protocol-smb plugin based on hierynomus/smbj > -------------------------------------------------------- > > Key: NUTCH-2856 > URL: https://issues.apache.org/jira/browse/NUTCH-2856 > Project: Nutch > Issue Type: New Feature > Components: external, plugin, protocol > Reporter: Hiran Chaudhuri > Assignee: Hiran Chaudhuri > Priority: Major > Fix For: 1.21 > > > The plugin protocol-smb advertized on > [https://cwiki.apache.org/confluence/display/NUTCH/PluginCentral] actually > refers to the JCIFS library. 
According to this library's homepage > [https://www.jcifs.org/]: > _If you're looking for the latest and greatest open source Java SMB library, > this is not it. JCIFS has been in maintenance-mode-only for several years and > although what it does support works fine (SMB1, NTLMv2, midlc, MSRPC and > various utility classes), jCIFS does not support the newer SMB2/3 variants of > the SMB protocol which is slowly becoming required (Windows 10 requires > SMB2/3). JCIFS only supports SMB1 but Microsoft has deprecated SMB1 in their > products. *So if SMB1 is disabled on your network, JCIFS' file related > operations will NOT work.*_ > Looking at > [https://en.wikipedia.org/wiki/Server_Message_Block#SMB_/_CIFS_/_SMB1]: > _Microsoft added SMB1 to the Windows Server 2012 R2 deprecation list in June > 2013. Windows Server 2016 and some versions of Windows 10 Fall Creators > Update do not have SMB1 installed by default._ > In conclusion, the chances that the SMB1 protocol is installed and/or > configured are getting vastly smaller. Therefore, some migration towards > SMB2/3 is required. Luckily, the JCIFS homepage lists alternatives: > * [jcifs-codelibs|https://github.com/codelibs/jcifs] > * [jcifs-ng|https://github.com/AgNO3/jcifs-ng] > * [smbj|https://github.com/hierynomus/smbj] -- This message was sent by Atlassian Jira (v8.20.10#820010)