[ https://issues.apache.org/jira/browse/NUTCH-2856?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17890230#comment-17890230 ]
ASF GitHub Bot commented on NUTCH-2856: --------------------------------------- HiranChaudhuri commented on code in PR #826: URL: https://github.com/apache/nutch/pull/826#discussion_r1803674538 ########## src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java: ########## @@ -0,0 +1,292 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.smb; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.protocol.RobotRulesParser; +import com.hierynomus.msdtyp.AccessMask; +import com.hierynomus.msfscc.FileAttributes; +import com.hierynomus.msfscc.fileinformation.FileAllInformation; +import com.hierynomus.msfscc.fileinformation.FileIdBothDirectoryInformation; +import com.hierynomus.mssmb2.SMB2CreateDisposition; +import com.hierynomus.mssmb2.SMB2CreateOptions; +import com.hierynomus.mssmb2.SMB2ShareAccess; +import com.hierynomus.smbj.auth.AuthenticationContext; +import com.hierynomus.smbj.connection.Connection; +import com.hierynomus.smbj.session.Session; +import com.hierynomus.smbj.share.DiskShare; +import com.hierynomus.smbj.share.File; +import com.hierynomus.smbj.SMBClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import crawlercommons.robots.BaseRobotRules; + +public class Smb implements Protocol { + protected static final Logger LOG = LoggerFactory.getLogger(Smb.class); + + private Configuration conf; + + private String user; + private String password; + private String domain; + private int contentLimit; + private Set<String> ignoreFiles; + + public 
Smb() { + // todo: files that should be skipped could be configurable. + this.ignoreFiles = new HashSet<>(); + ignoreFiles.add("."); + ignoreFiles.add(".."); + ignoreFiles.add(".svn"); + ignoreFiles.add(".git"); + } + + @Override + public Configuration getConf() { + LOG.debug("getConf()"); + return this.conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + + // todo: is it possible to use configuration "per server" or "per share"? Review Comment: It is implemented now. By default, the plugin searches for url-authentication.xml. An example structure is in url-authentication.xml.template. With that, I think we can resolve it. > Implement a protocol-smb plugin based on hierynomus/smbj > -------------------------------------------------------- > > Key: NUTCH-2856 > URL: https://issues.apache.org/jira/browse/NUTCH-2856 > Project: Nutch > Issue Type: New Feature > Components: external, plugin, protocol > Reporter: Hiran Chaudhuri > Assignee: Hiran Chaudhuri > Priority: Major > Fix For: 1.21 > > > The plugin protocol-smb advertised on > [https://cwiki.apache.org/confluence/display/NUTCH/PluginCentral] actually > refers to the JCIFS library. According to this library's homepage > [https://www.jcifs.org/]: > _If you're looking for the latest and greatest open source Java SMB library, > this is not it. JCIFS has been in maintenance-mode-only for several years and > although what it does support works fine (SMB1, NTLMv2, midlc, MSRPC and > various utility classes), jCIFS does not support the newer SMB2/3 variants of > the SMB protocol which is slowly becoming required (Windows 10 requires > SMB2/3). JCIFS only supports SMB1 but Microsoft has deprecated SMB1 in their > products. 
*So if SMB1 is disabled on your network, JCIFS' file related > operations will NOT work.*_ > Looking at > [https://en.wikipedia.org/wiki/Server_Message_Block#SMB_/_CIFS_/_SMB1]: > _Microsoft added SMB1 to the Windows Server 2012 R2 deprecation list in June > 2013. Windows Server 2016 and some versions of Windows 10 Fall Creators > Update do not have SMB1 installed by default._ > In conclusion, the chances that the SMB1 protocol is installed and/or > configured are shrinking rapidly. Therefore, some migration towards > SMB2/3 is required. Luckily, the JCIFS homepage lists alternatives: > * [jcifs-codelibs|https://github.com/codelibs/jcifs] > * [jcifs-ng|https://github.com/AgNO3/jcifs-ng] > * [smbj|https://github.com/hierynomus/smbj] -- This message was sent by Atlassian Jira (v8.20.10#820010)