bbende commented on code in PR #8765: URL: https://github.com/apache/nifi/pull/8765#discussion_r1594170696
########## nifi-extension-bundles/nifi-github-bundle/nifi-github-extensions/src/main/java/org/apache/nifi/github/GitHubFlowRegistryClient.java: ########## @@ -0,0 +1,633 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.apache.nifi.github; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.databind.json.JsonMapper; +import com.fasterxml.jackson.databind.type.TypeFactory; +import com.fasterxml.jackson.module.jakarta.xmlbind.JakartaXmlBindAnnotationIntrospector; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.components.ValidationContext; +import org.apache.nifi.components.ValidationResult; +import org.apache.nifi.flow.ConnectableComponent; +import org.apache.nifi.flow.Position; +import org.apache.nifi.flow.VersionedComponent; +import org.apache.nifi.flow.VersionedConnection; +import 
org.apache.nifi.flow.VersionedFlowCoordinates; +import org.apache.nifi.flow.VersionedProcessGroup; +import org.apache.nifi.processor.util.StandardValidators; +import org.apache.nifi.registry.flow.AbstractFlowRegistryClient; +import org.apache.nifi.registry.flow.BucketLocation; +import org.apache.nifi.registry.flow.FlowLocation; +import org.apache.nifi.registry.flow.FlowRegistryBranch; +import org.apache.nifi.registry.flow.FlowRegistryBucket; +import org.apache.nifi.registry.flow.FlowRegistryClientConfigurationContext; +import org.apache.nifi.registry.flow.FlowRegistryException; +import org.apache.nifi.registry.flow.FlowRegistryPermissions; +import org.apache.nifi.registry.flow.FlowVersionLocation; +import org.apache.nifi.registry.flow.RegisterAction; +import org.apache.nifi.registry.flow.RegisteredFlow; +import org.apache.nifi.registry.flow.RegisteredFlowSnapshot; +import org.apache.nifi.registry.flow.RegisteredFlowSnapshotMetadata; +import org.kohsuke.github.GHCommit; +import org.kohsuke.github.GHContent; +import org.kohsuke.github.GHContentUpdateResponse; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; + +/** + * Implementation of {@link org.apache.nifi.registry.flow.FlowRegistryClient} that uses GitHub for version controlling flows. 
+ */ +public class GitHubFlowRegistryClient extends AbstractFlowRegistryClient { + + static final PropertyDescriptor GITHUB_API_URL = new PropertyDescriptor.Builder() + .name("GitHub API URL") + .description("The URL of the GitHub API") + .addValidator(StandardValidators.URL_VALIDATOR) + .defaultValue("https://api.github.com/") + .required(true) + .build(); + + static final PropertyDescriptor REPOSITORY_NAME = new PropertyDescriptor.Builder() + .name("Repository Name") + .description("The name of the repository") + .addValidator(StandardValidators.NON_BLANK_VALIDATOR) + .required(true) + .build(); + + static final PropertyDescriptor REPOSITORY_OWNER = new PropertyDescriptor.Builder() + .name("Repository Owner") + .description("The owner of the repository") + .addValidator(StandardValidators.NON_BLANK_VALIDATOR) + .required(true) + .build(); + + static final PropertyDescriptor REPOSITORY_BRANCH = new PropertyDescriptor.Builder() + .name("Default Branch") + .description("The default branch to use for this client") + .addValidator(StandardValidators.NON_BLANK_VALIDATOR) + .defaultValue("main") + .required(true) + .build(); + + static final PropertyDescriptor REPOSITORY_PATH = new PropertyDescriptor.Builder() + .name("Repository Path") + .description("The path with in the repository that this client will use to store all data. 
" + + "If left blank, then the root of the repository will be used.") + .addValidator(StandardValidators.NON_BLANK_VALIDATOR) + .required(false) + .build(); + + static final PropertyDescriptor AUTHENTICATION_TYPE = new PropertyDescriptor.Builder() + .name("Authentication Type") + .description("The type of authentication to use for accessing GitHub") + .allowableValues(GitHubAuthenticationType.values()) + .defaultValue(GitHubAuthenticationType.NONE.name()) + .required(true) + .build(); + + static final PropertyDescriptor PERSONAL_ACCESS_TOKEN = new PropertyDescriptor.Builder() + .name("Personal Access Token") + .description("The personal access token to use for authentication") + .addValidator(StandardValidators.NON_BLANK_VALIDATOR) + .required(true) + .sensitive(true) + .dependsOn(AUTHENTICATION_TYPE, GitHubAuthenticationType.PERSONAL_ACCESS_TOKEN.name()) + .build(); + + static final PropertyDescriptor APP_INSTALLATION_TOKEN = new PropertyDescriptor.Builder() + .name("App Installation Token") + .description("The app installation token to use for authentication") + .addValidator(StandardValidators.NON_BLANK_VALIDATOR) + .required(true) + .sensitive(true) + .dependsOn(AUTHENTICATION_TYPE, GitHubAuthenticationType.APP_INSTALLATION_TOKEN.name()) + .build(); + + static final List<PropertyDescriptor> PROPERTY_DESCRIPTORS = List.of( + GITHUB_API_URL, + REPOSITORY_OWNER, + REPOSITORY_NAME, + REPOSITORY_BRANCH, + REPOSITORY_PATH, + AUTHENTICATION_TYPE, + PERSONAL_ACCESS_TOKEN, + APP_INSTALLATION_TOKEN + ); + + private static final ObjectMapper OBJECT_MAPPER = JsonMapper.builder() + .serializationInclusion(JsonInclude.Include.NON_NULL) + .defaultPropertyInclusion(JsonInclude.Value.construct(JsonInclude.Include.NON_NULL, JsonInclude.Include.NON_NULL)) + .annotationIntrospector(new JakartaXmlBindAnnotationIntrospector(TypeFactory.defaultInstance())) + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + .configure(MapperFeature.SORT_PROPERTIES_ALPHABETICALLY, 
true) + .enable(SerializationFeature.INDENT_OUTPUT) + .addModule(new VersionedComponentModule()) + .build(); + + static final String DEFAULT_BUCKET_NAME = "default"; + static final String DEFAULT_BUCKET_KEEP_FILE_PATH = DEFAULT_BUCKET_NAME + "/.keep"; + static final String DEFAULT_BUCKET_KEEP_FILE_CONTENT = "Do Not Delete"; + static final String DEFAULT_BUCKET_KEEP_FILE_MESSAGE = "Creating Default bucket"; + + static final String REGISTER_FLOW_COMMENT = "Register Flow"; + static final String DEREGISTER_FLOW_COMMENT = "Deregister Flow"; + static final String DEFAULT_FLOW_SNAPSHOT_COMMIT_MESSAGE = "Saving Flow Snapshot"; + static final String SNAPSHOT_FILE_EXTENSION = ".json"; + static final String SNAPSHOT_FILE_PATH_FORMAT = "%s/%s" + SNAPSHOT_FILE_EXTENSION; + static final String FLOW_CONTENTS_GROUP_ID = "flow-contents-group"; + + private volatile GitHubRepositoryClient repositoryClient; + private final AtomicBoolean initialized = new AtomicBoolean(false); + + @Override + protected List<PropertyDescriptor> getSupportedPropertyDescriptors() { + return PROPERTY_DESCRIPTORS; + } + + @Override + protected Collection<ValidationResult> customValidate(final ValidationContext validationContext) { + final List<ValidationResult> results = new ArrayList<>(super.customValidate(validationContext)); + + final String repoPath = validationContext.getProperty(REPOSITORY_PATH).getValue(); + if (repoPath != null && (repoPath.startsWith("/") || repoPath.endsWith("/"))) { + results.add(new ValidationResult.Builder() + .subject(REPOSITORY_PATH.getDisplayName()) + .valid(false) + .explanation("Path can not start or end with /") + .build()); + } + + return results; + } + + @Override + public void onPropertyModified(final PropertyDescriptor descriptor, final String oldValue, final String newValue) { + super.onPropertyModified(descriptor, oldValue, newValue); + synchronized (this) { + initialized.set(false); + } + } + + @Override + public boolean isStorageLocationApplicable(final 
FlowRegistryClientConfigurationContext context, final String location) { + return false; + } + + @Override + public boolean isBranchingSupported(final FlowRegistryClientConfigurationContext context) { + return true; + } + + @Override + public Set<FlowRegistryBranch> getBranches(final FlowRegistryClientConfigurationContext context) throws FlowRegistryException, IOException { + final GitHubRepositoryClient repositoryClient = getRepositoryClient(context); + return repositoryClient.getBranches().stream() + .map(branchName -> { + final FlowRegistryBranch flowRegistryBranch = new FlowRegistryBranch(); + flowRegistryBranch.setName(branchName); + return flowRegistryBranch; + }).collect(Collectors.toSet()); + } + + @Override + public FlowRegistryBranch getDefaultBranch(final FlowRegistryClientConfigurationContext context) { + final FlowRegistryBranch defaultBranch = new FlowRegistryBranch(); + defaultBranch.setName(context.getProperty(REPOSITORY_BRANCH).getValue()); + return defaultBranch; + } + + @Override + public Set<FlowRegistryBucket> getBuckets(final FlowRegistryClientConfigurationContext context, final String branch) throws IOException, FlowRegistryException { + final GitHubRepositoryClient repositoryClient = getRepositoryClient(context); + final Set<FlowRegistryBucket> buckets = repositoryClient.getDirectoryNames("", branch).stream() + .map(this::createFlowRegistryBucket) + .collect(Collectors.toSet()); + + // if the repository has no top-level directories, then return a default bucket entry, this won't exist in the repository until the first time a flow is saved to it + return buckets.isEmpty() ? 
Set.of(createFlowRegistryBucket(DEFAULT_BUCKET_NAME)) : buckets; + } + + @Override + public FlowRegistryBucket getBucket(final FlowRegistryClientConfigurationContext context, final BucketLocation bucketLocation) throws FlowRegistryException, IOException { + return createFlowRegistryBucket(bucketLocation.getBucketId()); + } + + @Override + public RegisteredFlow registerFlow(final FlowRegistryClientConfigurationContext context, final RegisteredFlow flow) throws FlowRegistryException, IOException { + final GitHubRepositoryClient repositoryClient = getRepositoryClient(context); + + final String branch = flow.getBranch(); + final FlowLocation flowLocation = new FlowLocation(branch, flow.getBucketIdentifier(), flow.getIdentifier()); + final String filePath = getSnapshotFilePath(flowLocation); + + final Optional<String> existingFileSha = repositoryClient.getContentSha(filePath, branch); + if (existingFileSha.isPresent()) { + throw new FlowRegistryException("Another flow is already registered at [" + filePath + "] on branch [" + branch + "]"); + } + + // Clear values we don't want in the json stored in GitHub + final String originalBucketId = flow.getBucketIdentifier(); + flow.setBucketIdentifier(null); + flow.setBucketName(null); + flow.setBranch(null); + + final RegisteredFlowSnapshot flowSnapshot = new RegisteredFlowSnapshot(); + flowSnapshot.setBucket(null); + flowSnapshot.setFlow(flow); + + final GitHubCreateContentRequest request = GitHubCreateContentRequest.builder() + .branch(branch) + .path(filePath) + .content(OBJECT_MAPPER.writeValueAsString(flowSnapshot)) + .message(REGISTER_FLOW_COMMENT) + .build(); + + repositoryClient.createContent(request); + + // Re-populate fields before returning + flow.setBucketName(originalBucketId); + flow.setBucketIdentifier(originalBucketId); + flow.setBranch(branch); + + return flow; + } + + @Override + public RegisteredFlow deregisterFlow(final FlowRegistryClientConfigurationContext context, final FlowLocation flowLocation) throws 
FlowRegistryException, IOException { + final GitHubRepositoryClient repositoryClient = getRepositoryClient(context); + + final String branch = flowLocation.getBranch(); + final String filePath = getSnapshotFilePath(flowLocation); + final GHContent deletedSnapshotContent = repositoryClient.deleteContent(filePath, DEREGISTER_FLOW_COMMENT, branch); + + final RegisteredFlowSnapshot deletedSnapshot = getSnapshot(deletedSnapshotContent.read()); + updateBucketReferences(deletedSnapshot, flowLocation.getBucketId()); + return deletedSnapshot.getFlow(); + } + + @Override + public RegisteredFlow getFlow(final FlowRegistryClientConfigurationContext context, final FlowLocation flowLocation) throws FlowRegistryException, IOException { + final String branch = flowLocation.getBranch(); + final String filePath = getSnapshotFilePath(flowLocation); + + final RegisteredFlowSnapshot existingSnapshot = getSnapshot(filePath, branch); + populateFlowAndSnapshotMetadata(existingSnapshot, flowLocation); + updateBucketReferences(existingSnapshot, flowLocation.getBucketId()); + + final RegisteredFlow registeredFlow = existingSnapshot.getFlow(); + registeredFlow.setBranch(branch); + return registeredFlow; + } + + @Override + public Set<RegisteredFlow> getFlows(final FlowRegistryClientConfigurationContext context, final BucketLocation bucketLocation) throws IOException, FlowRegistryException { + final GitHubRepositoryClient repositoryClient = getRepositoryClient(context); + + final String branch = bucketLocation.getBranch(); + final String bucketId = bucketLocation.getBucketId(); + final Set<RegisteredFlow> registeredFlows = new LinkedHashSet<>(); + + for (final String filename : repositoryClient.getFileNames(bucketId, branch)) { + if (!filename.endsWith(SNAPSHOT_FILE_EXTENSION)) { + continue; + } + + final String flowId = filename.replace(SNAPSHOT_FILE_EXTENSION, ""); + final RegisteredFlow registeredFlow = new RegisteredFlow(); + registeredFlow.setIdentifier(flowId); + 
registeredFlow.setName(flowId); + registeredFlow.setBranch(branch); + registeredFlow.setBucketIdentifier(bucketId); + registeredFlow.setBucketName(bucketId); + registeredFlows.add(registeredFlow); + } + + return registeredFlows; + } + + @Override + public RegisteredFlowSnapshot getFlowContents(final FlowRegistryClientConfigurationContext context, final FlowVersionLocation flowVersionLocation) + throws FlowRegistryException, IOException { + final GitHubRepositoryClient repositoryClient = getRepositoryClient(context); + + final String version = flowVersionLocation.getVersion(); + final String filePath = getSnapshotFilePath(flowVersionLocation); + + final InputStream inputStream = repositoryClient.getContentFromCommit(filePath, version); + final RegisteredFlowSnapshot flowSnapshot = getSnapshot(inputStream); + populateFlowAndSnapshotMetadata(flowSnapshot, flowVersionLocation); + + // populate values that aren't store in GitHub + flowSnapshot.getSnapshotMetadata().setVersion(version); + flowSnapshot.getSnapshotMetadata().setBranch(flowVersionLocation.getBranch()); + flowSnapshot.getFlow().setBranch(flowVersionLocation.getBranch()); + + // populate outgoing bucket references + updateBucketReferences(flowSnapshot, flowVersionLocation.getBucketId()); + + // determine if the version is the "latest" version by comparing to the response of getLatestVersion + final String latestVersion = getLatestVersion(context, flowVersionLocation).orElse(null); + flowSnapshot.setLatest(version.equals(latestVersion)); + + return flowSnapshot; + } + + @Override + public RegisteredFlowSnapshot registerFlowSnapshot(final FlowRegistryClientConfigurationContext context, final RegisteredFlowSnapshot flowSnapshot, final RegisterAction action) + throws FlowRegistryException, IOException { + final GitHubRepositoryClient repositoryClient = getRepositoryClient(context); + final RegisteredFlowSnapshotMetadata snapshotMetadata = flowSnapshot.getSnapshotMetadata(); + + final String branch = 
snapshotMetadata.getBranch(); + final FlowLocation flowLocation = new FlowLocation(snapshotMetadata.getBranch(), snapshotMetadata.getBucketIdentifier(), snapshotMetadata.getFlowIdentifier()); + final String filePath = getSnapshotFilePath(flowLocation); + final String previousSha = repositoryClient.getContentSha(filePath, branch).orElse(null); + + final String snapshotComments = snapshotMetadata.getComments(); + final String commitMessage = StringUtils.isBlank(snapshotComments) ? DEFAULT_FLOW_SNAPSHOT_COMMIT_MESSAGE : snapshotComments; + + final RegisteredFlowSnapshot existingSnapshot = getSnapshot(filePath, branch); + populateFlowAndSnapshotMetadata(existingSnapshot, flowLocation); + + final RegisteredFlow existingFlow = existingSnapshot.getFlow(); + existingFlow.setBranch(null); + flowSnapshot.setFlow(existingFlow); + + // Clear values we don't want stored in the json in GitHub + flowSnapshot.setBucket(null); + flowSnapshot.getSnapshotMetadata().setBucketIdentifier(null); + flowSnapshot.getSnapshotMetadata().setBranch(null); + flowSnapshot.getSnapshotMetadata().setVersion(null); + flowSnapshot.getSnapshotMetadata().setComments(null); + flowSnapshot.getSnapshotMetadata().setTimestamp(0); + + // replace the id of the top level group and all of its references with a constant value prior to serializing to avoid + // unnecessary diffs when different instances of the same flow are imported and have different top-level PG ids + final String originalFlowContentsGroupId = replaceGroupId(flowSnapshot.getFlowContents(), FLOW_CONTENTS_GROUP_ID); + final Position originalFlowContentsPosition = replacePosition(flowSnapshot.getFlowContents(), new Position(0, 0)); + + final GitHubCreateContentRequest createContentRequest = GitHubCreateContentRequest.builder() + .branch(branch) + .path(filePath) + .content(OBJECT_MAPPER.writeValueAsString(flowSnapshot)) + .message(commitMessage) + .existingContentSha(previousSha) + .build(); + + final GHContentUpdateResponse createContentResponse 
= repositoryClient.createContent(createContentRequest); + final String createContentCommitSha = createContentResponse.getCommit().getSha(); + + final VersionedFlowCoordinates versionedFlowCoordinates = new VersionedFlowCoordinates(); + versionedFlowCoordinates.setRegistryId(getIdentifier()); + versionedFlowCoordinates.setBranch(flowLocation.getBranch()); + versionedFlowCoordinates.setBucketId(flowLocation.getBucketId()); + versionedFlowCoordinates.setFlowId(flowLocation.getFlowId()); + versionedFlowCoordinates.setVersion(createContentCommitSha); + + flowSnapshot.getFlowContents().setVersionedFlowCoordinates(versionedFlowCoordinates); + flowSnapshot.getFlow().setBranch(branch); + flowSnapshot.getSnapshotMetadata().setBranch(branch); + flowSnapshot.getSnapshotMetadata().setVersion(createContentCommitSha); + flowSnapshot.setLatest(true); + + // populate outgoing bucket references + updateBucketReferences(flowSnapshot, flowLocation.getBucketId()); + + // set back to the original id so that the returned snapshot is has the correct values from what was passed in + replaceGroupId(flowSnapshot.getFlowContents(), originalFlowContentsGroupId); + replacePosition(flowSnapshot.getFlowContents(), originalFlowContentsPosition); + + return flowSnapshot; + } + + @Override + public Set<RegisteredFlowSnapshotMetadata> getFlowVersions(final FlowRegistryClientConfigurationContext context, final FlowLocation flowLocation) + throws FlowRegistryException, IOException { + final GitHubRepositoryClient repositoryClient = getRepositoryClient(context); + + final String branch = flowLocation.getBranch(); + final String filePath = getSnapshotFilePath(flowLocation); + + final Set<RegisteredFlowSnapshotMetadata> snapshotMetadataSet = new LinkedHashSet<>(); + for (final GHCommit ghCommit : repositoryClient.getCommits(filePath, branch)) { + final RegisteredFlowSnapshotMetadata snapshotMetadata = createSnapshotMetadata(ghCommit, flowLocation); + if 
(REGISTER_FLOW_COMMENT.equals(snapshotMetadata.getComments())) { + continue; + } + snapshotMetadataSet.add(snapshotMetadata); + } + return snapshotMetadataSet; + } + + @Override + public Optional<String> getLatestVersion(final FlowRegistryClientConfigurationContext context, final FlowLocation flowLocation) throws FlowRegistryException, IOException { + final GitHubRepositoryClient repositoryClient = getRepositoryClient(context); + + final String branch = flowLocation.getBranch(); + final String filePath = getSnapshotFilePath(flowLocation); + + final List<GHCommit> commits = repositoryClient.getCommits(filePath, branch); + final String latestVersion = commits.isEmpty() ? null : commits.getFirst().getSHA1(); + return Optional.ofNullable(latestVersion); + } + + @Override + public String generateFlowId(final String flowName) { + return flowName.trim() + .replaceAll("\\s", "-") // replace whitespace with - + .replaceAll("[^a-zA-Z0-9-]", "") // replace all other invalid chars with empty string + .replaceAll("(-)\\1+", "$1"); // replace consecutive - with single - + } + + private FlowRegistryBucket createFlowRegistryBucket(final String name) { + final FlowRegistryPermissions bucketPermissions = new FlowRegistryPermissions(); + bucketPermissions.setCanRead(true); Review Comment: Good question. The first thing to keep in mind here is that buckets in GitHub are directories, which themselves don't have permissions. If you have write access to the repo, you have write access to all buckets, etc. This concept of bucket permissions came from NiFi Registry, where a bucket can have permissions and NiFi Registry can tell NiFi what those permissions are. I think what we really need is permissions on the registry client, because for GitHub it's more of an all-or-nothing situation: if the client has a token with WRITE access to the repo, then the client has WRITE for all buckets, etc. 
We currently don't have a concept of permissions on registry clients, and the current authorization is extremely restrictive: https://issues.apache.org/jira/browse/NIFI-12482 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
