This is an automated email from the ASF dual-hosted git repository.
hez pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-devlake.git
The following commit(s) were added to refs/heads/main by this push:
new 8c86c2cd0 4879 nested property extraction with json pointer (#4976)
8c86c2cd0 is described below
commit 8c86c2cd0018e9f58ddf0ec934ece3c6b09dca9d
Author: Camille Teruel <[email protected]>
AuthorDate: Sat Apr 22 07:22:49 2023 +0200
4879 nested property extraction with json pointer (#4976)
* fix: PullRequest base and head repo ids must be domain ids
* style: Use snake_cased attributes
* feat: Use same PR state values as Github plugin
* refactor: Simplify extractors and clean up models
* refactor: Remove unused db.py
* feat: Autoextraction via JSON-pointer
* refactor: Add field source to models to remove extract methods
---------
Co-authored-by: Camille Teruel <[email protected]>
---
backend/python/README.md | 18 +++-
.../plugins/azuredevops/azuredevops/helper/db.py | 28 -----
.../python/plugins/azuredevops/azuredevops/main.py | 6 +-
.../plugins/azuredevops/azuredevops/models.py | 117 +++------------------
.../azuredevops/azuredevops/streams/builds.py | 28 ++---
.../azuredevops/azuredevops/streams/jobs.py | 10 +-
.../azuredevops/streams/pull_request_commits.py | 16 +--
.../azuredevops/streams/pull_requests.py | 39 ++++---
backend/python/plugins/azuredevops/poetry.lock | 13 +++
.../plugins/azuredevops/tests/streams_test.py | 10 +-
backend/python/pydevlake/poetry.lock | 14 ++-
backend/python/pydevlake/pydevlake/__init__.py | 17 ++-
backend/python/pydevlake/pydevlake/extractor.py | 46 ++++++--
backend/python/pydevlake/pydevlake/model.py | 23 ++--
backend/python/pydevlake/pydevlake/stream.py | 3 +-
backend/python/pydevlake/pyproject.toml | 1 +
backend/python/pydevlake/tests/extractor_test.py | 87 +++++++++++++++
backend/python/test/fakeplugin/poetry.lock | 13 +++
18 files changed, 264 insertions(+), 225 deletions(-)
diff --git a/backend/python/README.md b/backend/python/README.md
index d0658c94a..062f6053e 100644
--- a/backend/python/README.md
+++ b/backend/python/README.md
@@ -233,7 +233,7 @@ class Users(Stream):
def collect(self, state, context) -> Iterable[Tuple[object, dict]]:
pass
- def extract(self, raw_data, context) -> ToolUser:
+ def extract(self, raw_data) -> ToolUser:
pass
def convert(self, user: ToolUser, context) -> Iterable[DomainUser]:
@@ -251,9 +251,19 @@ The `collect` method takes a `state` dictionary and a
context object and yields
The last state that the plugin yielded for a given connection will be reused
during the next collection.
The plugin can use this `state` to store information necessary to perform
incremental collection of data.
-The `extract` method takes a raw data object and a context object and returns
a tool model. This method has a default implementation that uses the
`tool_model` class attribute to create a new instance of the tool model and set
its attributes from the raw data (`self.tool_model(**raw_data)`).
-If the raw data collected from the datasource and is simple enough and well
aligned with your tool model, you can omit this method.
-Otherwise, you can override it to deal with e.g. nested data structures.
+The `extract` method takes a raw data object and returns a tool model.
+This method has a default implementation that populates an instance of the
`tool_model` class with the raw data.
+When you need to extract a nested value from JSON raw data, you can specify a
JSON pointer (see RFC 6901) as the `source` argument to a `Field`
declaration.
+
+```python
+class User(ToolModel, table=True):
+ id: str = Field(primary_key=True)
+ name: str
+ email: str
+ address: str = Field(source="/contactInfo/address")
+```
+
+Here the address field will be populated with the value of the `address`
property of the `contactInfo` object property of the JSON object.
The `convert` method takes a tool-specific user model and convert it into
domain level user models.
Here the two models align quite well, the conversion is trivial:
diff --git a/backend/python/plugins/azuredevops/azuredevops/helper/db.py
b/backend/python/plugins/azuredevops/azuredevops/helper/db.py
deleted file mode 100644
index 0c1f81be3..000000000
--- a/backend/python/plugins/azuredevops/azuredevops/helper/db.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Type
-
-from sqlalchemy import sql
-from sqlmodel import Session
-
-from pydevlake import Context
-
-
-def get(ctx: Context, model_type: Type, *query) -> any:
- with Session(ctx.engine) as session:
- stmt = sql.select(model_type).filter(*query)
- model = session.exec(stmt).scalar()
- return model
diff --git a/backend/python/plugins/azuredevops/azuredevops/main.py
b/backend/python/plugins/azuredevops/azuredevops/main.py
index 17a27a815..42b98f186 100644
--- a/backend/python/plugins/azuredevops/azuredevops/main.py
+++ b/backend/python/plugins/azuredevops/azuredevops/main.py
@@ -47,7 +47,7 @@ class AzureDevOpsPlugin(Plugin):
yield Repo(
name=git_repo.name,
url=git_repo.url,
- forked_from=git_repo.parentRepositoryUrl
+ forked_from=git_repo.parent_repository_url
)
yield CicdScope(
@@ -85,10 +85,10 @@ class AzureDevOpsPlugin(Plugin):
url = url._replace(netloc=url.hostname)
raw_repo['url'] = url.geturl()
repo = GitRepository(**raw_repo)
- if not repo.defaultBranch:
+ if not repo.default_branch:
continue
if "parentRepository" in raw_repo:
- repo.parentRepositoryUrl = raw_repo["parentRepository"]["url"]
+ repo.parent_repository_url =
raw_repo["parentRepository"]["url"]
yield repo
def test_connection(self, connection: AzureDevOpsConnection):
diff --git a/backend/python/plugins/azuredevops/azuredevops/models.py
b/backend/python/plugins/azuredevops/azuredevops/models.py
index c374754e7..b194f0cf5 100644
--- a/backend/python/plugins/azuredevops/azuredevops/models.py
+++ b/backend/python/plugins/azuredevops/azuredevops/models.py
@@ -18,9 +18,7 @@ from enum import Enum
from typing import Optional
import re
-from sqlmodel import Field
-
-from pydevlake import Connection, TransformationRule
+from pydevlake import Field, Connection, TransformationRule
from pydevlake.model import ToolModel, ToolScope
from pydevlake.pipeline_tasks import RefDiffOptions
@@ -36,109 +34,52 @@ class AzureDevOpsTransformationRule(TransformationRule):
production_pattern: Optional[re.Pattern]
-class Project(ToolModel, table=True):
- id: str = Field(primary_key=True)
- name: str
- url: str
-
-
class GitRepository(ToolScope, table=True):
url: str
- sshUrl: str
remoteUrl: str
- defaultBranch: Optional[str]
- project_id: str # = Field(foreign_key=Project.id)
+ default_branch: Optional[str]
+ project_id: str
org_id: str
- size: int
- isDisabled: bool
- isInMaintenance: bool
- isFork: Optional[bool]
- parentRepositoryUrl: Optional[str]
+ parent_repository_url: Optional[str] = Field(source='parentRepository/url')
-#
https://learn.microsoft.com/en-us/rest/api/azure/devops/git/pull-requests/get-pull-requests?view=azure-devops-rest-7.1&tabs=HTTP#identityrefwithvote
class GitPullRequest(ToolModel, table=True):
class Status(Enum):
Abandoned = "abandoned"
Active = "active"
- All = "all"
Completed = "completed"
- NotSet = "notSet"
- id: int = Field(primary_key=True)
- project_id: Optional[str]
+ pull_request_id: int = Field(primary_key=True)
description: Optional[str]
- code_review_id: int = 0
- repo_id: Optional[str]
status: Status
- created_by_id: Optional[str]
- created_by_name: Optional[str]
+ created_by_id: str = Field(source='/createdBy/id')
+ created_by_name: str = Field(source='/createdBy/displayName')
creation_date: datetime.datetime
closed_date: Optional[datetime.datetime]
- source_commit_sha: Optional[str] # lastmergesourcecommit #base
- target_commit_sha: Optional[str] # lastmergetargetcommit #head
- merge_commit_sha: Optional[str]
+ source_commit_sha: str = Field(source='/lastMergeSourceCommit/commitId')
+ target_commit_sha: str = Field(source='/lastMergeTargetCommit/commitId')
+ merge_commit_sha: str = Field(source='/lastMergeCommit/commitId')
url: Optional[str]
- type: Optional[str]
+ type: Optional[str] = Field(source='/labels/0/name') # TODO: get this off
transformation rules regex
title: Optional[str]
target_ref_name: Optional[str]
source_ref_name: Optional[str]
- fork_repo_id: Optional[str]
+ fork_repo_id: Optional[str] = Field(source='/forkSource/repository/id')
class GitPullRequestCommit(ToolModel, table=True):
- commit_sha: str = Field(primary_key=True)
+ commit_id: str = Field(primary_key=True)
pull_request_id: str
- committer_name: str
- committer_email: str
- commit_date: datetime.datetime
- author_name: str
- author_email: str
- authored_date: datetime.datetime
- comment: str
- url: str
- additions: int
- deletions: int
-
-
-class Account(ToolModel, table=True):
- class Type(Enum):
- Organization = "organization"
- Personal = "personal"
-
- class Status(Enum):
- Deleted = "deleted"
- Disabled = "disabled"
- Enabled = "enabled"
- Moved = "moved"
- Non = "none"
-
- account_id: str = Field(primary_key=True)
- account_name: str
- account_owner: str
- account_type: Type
- account_status: Status
- organization_name: str
- namespace_id: str
class Build(ToolModel, table=True):
class Status(Enum):
- All = "all"
Cancelling = "cancelling"
Completed = "completed"
InProgress = "inProgress"
- Non = "none"
NotStarted = "notStarted"
Postponed = "postponed"
- class Priority(Enum):
- AboveNormal = "aboveNormal"
- BelowNormal = "belowNormal"
- High = "high"
- Low = "low"
- Normal = "normal"
-
class Result(Enum):
Canceled = "canceled"
Failed = "failed"
@@ -147,33 +88,16 @@ class Build(ToolModel, table=True):
Succeeded = "succeeded"
id: int = Field(primary_key=True)
- name: str
- project_id: str
- repo_id: str
- repo_type: str
- build_number: str
- build_number_revision: Optional[str]
- controller_id: Optional[str]
- definition_id: Optional[str]
- deleted: Optional[bool]
+ name: str = Field(source='/definition/name')
start_time: Optional[datetime.datetime]
finish_time: Optional[datetime.datetime]
status: Status
- tags: list[str] = []
- priority: Priority
- build_result: Result
+ result: Result
source_branch: str
source_version: str
class Job(ToolModel, table=True):
- class Type(Enum):
- Task = "Task"
- Job = "Job"
- Checkpoint = "Checkpoint"
- Stage = "Stage"
- Phase = "Phase"
-
class State(Enum):
Completed = "completed"
InProgress = "inProgress"
@@ -189,19 +113,8 @@ class Job(ToolModel, table=True):
id: str = Field(primary_key=True)
build_id: str
- parentId: Optional[str]
- type: Optional[Type]
name: str
startTime: datetime.datetime
finishTime: datetime.datetime
- lastModified: datetime.datetime
- currentOperation: Optional[int]
- percentComplete: Optional[int]
state: State
result: Result
- resultCode: Optional[int]
- changeId: Optional[int]
- workerName: Optional[str]
- order: Optional[int]
- errorCount: Optional[int]
- warningCount: Optional[int]
diff --git a/backend/python/plugins/azuredevops/azuredevops/streams/builds.py
b/backend/python/plugins/azuredevops/azuredevops/streams/builds.py
index bc105cc62..006d3315a 100644
--- a/backend/python/plugins/azuredevops/azuredevops/streams/builds.py
+++ b/backend/python/plugins/azuredevops/azuredevops/streams/builds.py
@@ -20,7 +20,7 @@ import iso8601 as iso8601
from azuredevops.api import AzureDevOpsAPI
from azuredevops.models import GitRepository
from azuredevops.models import Build
-from pydevlake import Context, DomainType, Stream, logger
+from pydevlake import Context, DomainType, Stream
import pydevlake.domain_layer.devops as devops
@@ -35,35 +35,19 @@ class Builds(Stream):
for raw_build in response:
yield raw_build, state
- def extract(self, raw_data: dict) -> Build:
- build: Build = self.tool_model(**raw_data)
- build.name = raw_data["definition"]["name"]
- build.project_id = raw_data["project"]["id"]
- build.repo_id = raw_data["repository"]["id"]
- build.repo_type = raw_data["repository"]["type"]
- build.build_number = raw_data["buildNumber"]
- build.tags = ",".join(raw_data["tags"])
- build.build_result = Build.Result(raw_data["result"])
- trigger_info: dict = raw_data["triggerInfo"]
- if "ci.sourceSha" in trigger_info: # this key is not guaranteed to be
in here per docs
- assert build.source_version == trigger_info["ci.sourceSha"]
- return build
-
def convert(self, b: Build, ctx: Context):
result = None
- if b.build_result == Build.Result.Canceled:
+ if b.result == Build.Result.Canceled:
result = devops.CICDResult.ABORT
- elif b.build_result == Build.Result.Failed:
+ elif b.result == Build.Result.Failed:
result = devops.CICDResult.FAILURE
- elif b.build_result == Build.Result.PartiallySucceeded:
+ elif b.result == Build.Result.PartiallySucceeded:
result = devops.CICDResult.SUCCESS
- elif b.build_result == Build.Result.Succeeded:
+ elif b.result == Build.Result.Succeeded:
result = devops.CICDResult.SUCCESS
status = None
- if b.status == Build.Status.All:
- status = devops.CICDStatus.IN_PROGRESS
- elif b.status == Build.Status.Cancelling:
+ if b.status == Build.Status.Cancelling:
status = devops.CICDStatus.DONE
elif b.status == Build.Status.Completed:
status = devops.CICDStatus.DONE
diff --git a/backend/python/plugins/azuredevops/azuredevops/streams/jobs.py
b/backend/python/plugins/azuredevops/azuredevops/streams/jobs.py
index 89e6e4a25..131739551 100644
--- a/backend/python/plugins/azuredevops/azuredevops/streams/jobs.py
+++ b/backend/python/plugins/azuredevops/azuredevops/streams/jobs.py
@@ -31,13 +31,9 @@ class Jobs(Substream):
repo: GitRepository = context.scope
api = AzureDevOpsAPI(context.connection)
response = api.jobs(repo.org_id, repo.project_id, parent.id)
- if response.status != 200:
- yield None, state
- else:
- for raw_job in response.json["records"]:
- raw_job["build_id"] = parent.domain_id()
- yield raw_job, state
-
+ for raw_job in response.json["records"]:
+ raw_job["build_id"] = parent.domain_id()
+ yield raw_job, state
def convert(self, j: Job, ctx: Context) -> Iterable[devops.CICDPipeline]:
result = None
diff --git
a/backend/python/plugins/azuredevops/azuredevops/streams/pull_request_commits.py
b/backend/python/plugins/azuredevops/azuredevops/streams/pull_request_commits.py
index 07fe9c78f..14007b36d 100644
---
a/backend/python/plugins/azuredevops/azuredevops/streams/pull_request_commits.py
+++
b/backend/python/plugins/azuredevops/azuredevops/streams/pull_request_commits.py
@@ -35,22 +35,8 @@ class GitPullRequestCommits(Substream):
raw_commit["pull_request_id"] = parent.domain_id()
yield raw_commit, state
- def extract(self, raw_data: dict) -> GitPullRequestCommit:
- return GitPullRequestCommit(
- **raw_data,
- commit_sha = raw_data["commitId"],
- author_name = raw_data["author"]["name"],
- author_email = raw_data["author"]["email"],
- authored_date = raw_data["author"]["date"],
- committer_name = raw_data["committer"]["name"],
- committer_email = raw_data["committer"]["email"],
- commit_date = raw_data["committer"]["date"],
- additions = raw_data["changeCounts"]["Add"] if "changeCounts" in
raw_data else 0,
- deletions = raw_data["changeCounts"]["Delete"] if "changeCounts"
in raw_data else 0
- )
-
def convert(self, commit: GitPullRequestCommit, context) ->
Iterable[code.PullRequestCommit]:
yield code.PullRequestCommit(
- commit_sha=commit.commit_sha,
+ commit_sha=commit.commit_id,
pull_request_id=commit.pull_request_id,
)
diff --git
a/backend/python/plugins/azuredevops/azuredevops/streams/pull_requests.py
b/backend/python/plugins/azuredevops/azuredevops/streams/pull_requests.py
index 446bf031a..b7ab834be 100644
--- a/backend/python/plugins/azuredevops/azuredevops/streams/pull_requests.py
+++ b/backend/python/plugins/azuredevops/azuredevops/streams/pull_requests.py
@@ -17,7 +17,7 @@ from typing import Iterable
from azuredevops.api import AzureDevOpsAPI
from azuredevops.models import GitRepository, GitPullRequest
-from pydevlake import Stream, DomainType
+from pydevlake import Stream, DomainType, domain_id
import pydevlake.domain_layer.code as code
@@ -32,33 +32,30 @@ class GitPullRequests(Stream):
for raw_pr in response:
yield raw_pr, state
- def extract(self, raw_data: dict) -> GitPullRequest:
- pr = GitPullRequest(**raw_data)
- pr.id = raw_data["pullRequestId"]
- pr.created_by_id = raw_data["createdBy"]["id"]
- pr.created_by_name = raw_data["createdBy"]["displayName"]
- pr.repo_id = raw_data["repository"]["id"]
- pr.source_commit_sha = raw_data["lastMergeSourceCommit"]["commitId"]
- pr.target_commit_sha = raw_data["lastMergeTargetCommit"]["commitId"]
- pr.merge_commit_sha = raw_data["lastMergeCommit"]["commitId"]
- if "labels" in raw_data:
- # TODO get this off transformation rules regex
- pr.type = raw_data["labels"][0]["name"]
- if "forkSource" in raw_data:
- pr.fork_repo_id = raw_data["forkSource"]["repository"]["id"]
- return pr
-
def convert(self, pr: GitPullRequest, ctx):
+ repo_id = ctx.scope.domain_id()
+ # If the PR is from a fork, we forge a new repo ID for the base repo
but it doesn't correspond to a real repo
+ base_repo_id = domain_id(GitRepository, ctx.connection.id,
pr.fork_repo_id) if pr.fork_repo_id is not None else repo_id
+
+ # Use the same status values as GitHub plugin
+ status = None
+ if pr.status == GitPullRequest.Status.Abandoned:
+ status = 'CLOSED'
+ elif pr.status == GitPullRequest.Status.Active:
+ status = 'OPEN'
+ elif pr.status == GitPullRequest.Status.Completed:
+ status = 'MERGED'
+
yield code.PullRequest(
- base_repo_id=(pr.fork_repo_id if pr.fork_repo_id is not None else
pr.repo_id),
- head_repo_id=pr.repo_id,
- status=pr.status.value,
+ base_repo_id=base_repo_id,
+ head_repo_id=repo_id,
+ status=status,
title=pr.title,
description=pr.description,
url=pr.url,
author_name=pr.created_by_name,
author_id=pr.created_by_id,
- pull_request_key=pr.id,
+ pull_request_key=pr.pull_request_id,
created_date=pr.creation_date,
merged_date=pr.closed_date,
closed_date=pr.closed_date,
diff --git a/backend/python/plugins/azuredevops/poetry.lock
b/backend/python/plugins/azuredevops/poetry.lock
index 63c0e1b47..85afd5cb0 100644
--- a/backend/python/plugins/azuredevops/poetry.lock
+++ b/backend/python/plugins/azuredevops/poetry.lock
@@ -268,6 +268,18 @@ files = [
{file = "iso8601-1.1.0.tar.gz", hash =
"sha256:32811e7b81deee2063ea6d2e94f8819a86d1f3811e49d23623a41fa832bef03f"},
]
+[[package]]
+name = "jsonpointer"
+version = "2.3"
+description = "Identify specific nodes in a JSON document (RFC 6901)"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+ {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash =
"sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"},
+ {file = "jsonpointer-2.3.tar.gz", hash =
"sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"},
+]
+
[[package]]
name = "jsonref"
version = "1.1.0"
@@ -425,6 +437,7 @@ develop = true
[package.dependencies]
fire = "^0.4.0"
inflect = "^6.0.2"
+jsonpointer = "^2.3"
jsonref = "^1.1.0"
mysqlclient = "^2.1.1"
psycopg2 = "^2.9.5"
diff --git a/backend/python/plugins/azuredevops/tests/streams_test.py
b/backend/python/plugins/azuredevops/tests/streams_test.py
index 6725e45d5..9d95960e5 100644
--- a/backend/python/plugins/azuredevops/tests/streams_test.py
+++ b/backend/python/plugins/azuredevops/tests/streams_test.py
@@ -197,7 +197,7 @@ def test_jobs_stream(context):
assert_stream_convert(AzureDevOpsPlugin, 'jobs', raw, expected, context)
-def test_pull_requests_stream():
+def test_pull_requests_stream(context):
raw = {
'repository': {
'id': '0d50ba13-f9ad-49b0-9b21-d29eda50ca33',
@@ -273,9 +273,9 @@ def test_pull_requests_stream():
}
expected = code.PullRequest(
- base_repo_id='0d50ba13-f9ad-49b0-9b21-d29eda50ca33',
- head_repo_id='0d50ba13-f9ad-49b0-9b21-d29eda50ca33',
- status='active',
+ base_repo_id='azuredevops:GitRepository:1:johndoe/test-repo',
+ head_repo_id='azuredevops:GitRepository:1:johndoe/test-repo',
+ status='OPEN',
title='ticket-2 PR',
description='Updated main.java by ticket-2',
url='https://dev.azure.com/johndoe/7a3fd40e-2aed-4fac-bac9-511bf1a70206/_apis/git/repositories/0d50ba13-f9ad-49b0-9b21-d29eda50ca33/pullRequests/1',
@@ -294,7 +294,7 @@ def test_pull_requests_stream():
base_commit_sha='4bc26d92b5dbee7837a4d221035a4e2f8df120b2'
)
- assert_stream_convert(AzureDevOpsPlugin, 'gitpullrequests', raw, expected)
+ assert_stream_convert(AzureDevOpsPlugin, 'gitpullrequests', raw, expected,
context)
def test_pull_request_commits_stream():
diff --git a/backend/python/pydevlake/poetry.lock
b/backend/python/pydevlake/poetry.lock
index 3c693468c..301036519 100644
--- a/backend/python/pydevlake/poetry.lock
+++ b/backend/python/pydevlake/poetry.lock
@@ -256,6 +256,18 @@ files = [
{file = "iniconfig-2.0.0.tar.gz", hash =
"sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
]
+[[package]]
+name = "jsonpointer"
+version = "2.3"
+description = "Identify specific nodes in a JSON document (RFC 6901)"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+ {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash =
"sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"},
+ {file = "jsonpointer-2.3.tar.gz", hash =
"sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"},
+]
+
[[package]]
name = "jsonref"
version = "1.1.0"
@@ -623,4 +635,4 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[metadata]
lock-version = "2.0"
python-versions = "~3.9"
-content-hash =
"75a142aa83def49843dbd04fbaea54779d3cc12398fc484bda5be0e7ebc28e48"
+content-hash =
"cb0be45618de6cb1105d292164427e16a515938e44291039a8e0f0cce32fa26d"
diff --git a/backend/python/pydevlake/pydevlake/__init__.py
b/backend/python/pydevlake/pydevlake/__init__.py
index 237c345d5..d645c0749 100644
--- a/backend/python/pydevlake/pydevlake/__init__.py
+++ b/backend/python/pydevlake/pydevlake/__init__.py
@@ -13,10 +13,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+from typing import Any, Optional
+
import pytest
pytest.register_assert_rewrite('pydevlake.testing')
-from .model import ToolModel, ToolScope, DomainScope, Connection,
TransformationRule
+from sqlmodel import Field as _Field
+
+
+def Field(*args, schema_extra: Optional[dict[str, Any]]=None, source:
Optional[str]=None, **kwargs):
+ """
+ A wrapper around sqlmodel.Field that adds a source parameter.
+ """
+ schema_extra = schema_extra or {}
+ if source:
+ schema_extra['source'] = source
+ return _Field(*args, **kwargs, schema_extra=schema_extra)
+
+
+from .model import ToolModel, ToolScope, DomainScope, Connection,
TransformationRule, domain_id
from .logger import logger
from .message import RemoteScopeGroup
from .plugin import Plugin, ScopeTxRulePair
diff --git a/backend/python/pydevlake/pydevlake/extractor.py
b/backend/python/pydevlake/pydevlake/extractor.py
index 8ee42914a..5943e1b86 100644
--- a/backend/python/pydevlake/pydevlake/extractor.py
+++ b/backend/python/pydevlake/pydevlake/extractor.py
@@ -15,15 +15,45 @@
from typing import Type
+
+from jsonpointer import resolve_pointer, JsonPointerException
+
from pydevlake import ToolModel
def autoextract(json: dict, model_cls: Type[ToolModel]) -> ToolModel:
- annotations = dict(model_cls.__annotations__)
- for key, value in json.items():
- if key in annotations:
- expected_type = annotations[key]
- if isinstance(expected_type, type) and issubclass(expected_type,
ToolModel):
- # TODO: replace with actual foreign key
- json[key] = value["id"]
- return model_cls(**json)
+ """
+ Automatically extract a tool model from a json object.
+ The tool model class can define fields with a source argument to specify
the JSON pointer (RFC 6901) to the value.
+
+ Example:
+ class DummyModel(ToolModel):
+ name: str
+ version: str = Field(source='/version/number')
+
+ json = {
+ 'name': 'test',
+ 'version': {
+ 'number': '1.0.0',
+ 'build_date': '2023-04-19'
+ }
+ }
+
+ model = autoextract(json, DummyModel)
+ """
+ attributes = {}
+ for field in model_cls.__fields__.values():
+ pointer = field.field_info.extra.get('source')
+
+ if pointer:
+ if field.required:
+ try:
+ value = resolve_pointer(json, pointer)
+ except JsonPointerException:
+ raise ValueError(f"Missing required value for field
{field.name} at {pointer}")
+ else:
+ value = resolve_pointer(json, pointer, field.default)
+ else:
+ value = json.get(field.name) or json.get(field.alias)
+ attributes[field.name] = value
+ return model_cls(**attributes)
diff --git a/backend/python/pydevlake/pydevlake/model.py
b/backend/python/pydevlake/pydevlake/model.py
index e45dd893e..51df93051 100644
--- a/backend/python/pydevlake/pydevlake/model.py
+++ b/backend/python/pydevlake/pydevlake/model.py
@@ -15,13 +15,13 @@
import os
-from typing import Optional
+from typing import Iterable, Optional
from inspect import getmodule
from datetime import datetime
import inflect
from pydantic import AnyUrl, validator
-from sqlalchemy import Column, DateTime, func
+from sqlalchemy import Column, DateTime
from sqlalchemy.orm import declared_attr
from sqlalchemy.inspection import inspect
from sqlmodel import SQLModel, Field
@@ -107,16 +107,16 @@ class ToolModel(ToolTable, NoPKModel):
Generate an identifier for domain entities
originates from self.
"""
+ return domain_id(type(self), self.connection_id, *self.primary_keys())
+
+ def primary_keys(self) -> Iterable[object]:
model_type = type(self)
- segments = [_get_plugin_name(model_type), model_type.__name__,
str(self.connection_id)]
mapper = inspect(model_type)
for primary_key_column in mapper.primary_key:
prop = mapper.get_property_by_column(primary_key_column)
if prop.key == 'connection_id':
continue
- attr_val = getattr(self, prop.key)
- segments.append(str(attr_val))
- return ':'.join(segments)
+ yield getattr(self, prop.key)
class Config:
allow_population_by_field_name = True
@@ -129,7 +129,6 @@ class ToolModel(ToolTable, NoPKModel):
return parts[0] + ''.join(word.capitalize() for word in parts[1:])
-
class DomainModel(NoPKModel):
id: Optional[str] = Field(primary_key=True)
@@ -143,6 +142,16 @@ class DomainScope(DomainModel):
pass
+def domain_id(model_type, connection_id, *args):
+ """
+ Generate an identifier for domain entities
+ originates from a model of type model_type.
+ """
+ segments = [_get_plugin_name(model_type), model_type.__name__,
str(connection_id)]
+ segments.extend(str(arg) for arg in args)
+ return ':'.join(segments)
+
+
def _get_plugin_name(cls):
"""
Get the plugin name from a class by looking into
diff --git a/backend/python/pydevlake/pydevlake/stream.py
b/backend/python/pydevlake/pydevlake/stream.py
index 54c8288de..d6df9f7f4 100644
--- a/backend/python/pydevlake/pydevlake/stream.py
+++ b/backend/python/pydevlake/pydevlake/stream.py
@@ -21,6 +21,7 @@ from enum import Enum
from pydevlake.context import Context
from pydevlake.subtasks import Collector, Extractor, Convertor,
SubstreamCollector
from pydevlake.model import RawModel, ToolModel, ToolScope, DomainModel
+from pydevlake.extractor import autoextract
class DomainType(Enum):
@@ -86,7 +87,7 @@ class Stream:
pass
def extract(self, raw_data: dict) -> ToolModel:
- return self.tool_model(**raw_data)
+ return autoextract(raw_data, self.tool_model)
def convert(self, tool_model: ToolModel, context: Context) -> DomainModel:
pass
diff --git a/backend/python/pydevlake/pyproject.toml
b/backend/python/pydevlake/pyproject.toml
index 0bfcadbb4..6cd69b77a 100644
--- a/backend/python/pydevlake/pyproject.toml
+++ b/backend/python/pydevlake/pyproject.toml
@@ -33,6 +33,7 @@ pydevd-pycharm = "^231.6471.3"
pytest = "^7.2.2"
jsonref = "^1.1.0"
psycopg2 = "^2.9.5"
+jsonpointer = "^2.3"
[tool.poetry.group.dev.dependencies]
diff --git a/backend/python/pydevlake/tests/extractor_test.py
b/backend/python/pydevlake/tests/extractor_test.py
new file mode 100644
index 000000000..48a1d91de
--- /dev/null
+++ b/backend/python/pydevlake/tests/extractor_test.py
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+from pydevlake import ToolModel, Field
+from pydevlake.extractor import autoextract
+import pytest
+
+
+def test_autoextract_from_json_pointer():
+ class DummyModel(ToolModel):
+ name: str
+ version: str = Field(source='/version/number')
+
+ json = {
+ 'name': 'test',
+ 'version': {
+ 'number': '1.0.0',
+ 'build_date': '2023-04-19'
+ }
+ }
+ model = autoextract(json, DummyModel)
+ assert model.name == 'test'
+ assert model.version == '1.0.0'
+
+
+def test_autoextract_optional_field_with_missing_value():
+ class DummyModel(ToolModel):
+ name: str
+ version: Optional[str] = Field(source='/version/number')
+
+ json = {
+ 'name': 'test',
+ 'version': {
+ # missing 'number'
+ 'build_date': '2023-04-19'
+ }
+ }
+ model = autoextract(json, DummyModel)
+ assert model.name == 'test'
+ assert model.version == None
+
+
+def test_autoextract_optional_field_with_default_with_missing_value():
+ class DummyModel(ToolModel):
+ name: str
+ version: Optional[str] = Field(source='/version/number',
default='1.0.0')
+
+ json = {
+ 'name': 'test',
+ 'version': {
+ # missing 'number'
+ 'build_date': '2023-04-19'
+ }
+ }
+ model = autoextract(json, DummyModel)
+ assert model.name == 'test'
+ assert model.version == '1.0.0'
+
+
+
+def test_autoextract_required_field_and_missing_field():
+ class DummyModel(ToolModel):
+ name: str
+ version: str = Field(source='/version/number')
+
+ json = {
+ 'name': 'test',
+ 'version': {
+ # missing 'number'
+ 'build_date': '2023-04-19'
+ }
+ }
+
+ pytest.raises(ValueError, autoextract, json, DummyModel)
diff --git a/backend/python/test/fakeplugin/poetry.lock
b/backend/python/test/fakeplugin/poetry.lock
index a9067012c..dcceb7d68 100644
--- a/backend/python/test/fakeplugin/poetry.lock
+++ b/backend/python/test/fakeplugin/poetry.lock
@@ -256,6 +256,18 @@ files = [
{file = "iniconfig-2.0.0.tar.gz", hash =
"sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
]
+[[package]]
+name = "jsonpointer"
+version = "2.3"
+description = "Identify specific nodes in a JSON document (RFC 6901)"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+ {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash =
"sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"},
+ {file = "jsonpointer-2.3.tar.gz", hash =
"sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"},
+]
+
[[package]]
name = "jsonref"
version = "1.1.0"
@@ -413,6 +425,7 @@ develop = true
[package.dependencies]
fire = "^0.4.0"
inflect = "^6.0.2"
+jsonpointer = "^2.3"
jsonref = "^1.1.0"
mysqlclient = "^2.1.1"
psycopg2 = "^2.9.5"