This is an automated email from the ASF dual-hosted git repository.

hez pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-devlake.git


The following commit(s) were added to refs/heads/main by this push:
     new 8c86c2cd0 4879 nested property extraction with json pointer (#4976)
8c86c2cd0 is described below

commit 8c86c2cd0018e9f58ddf0ec934ece3c6b09dca9d
Author: Camille Teruel <[email protected]>
AuthorDate: Sat Apr 22 07:22:49 2023 +0200

    4879 nested property extraction with json pointer (#4976)
    
    * fix: PullRequest base and head repo ids must be domain ids
    
    * style: Use snake_cased attributes
    
    * feat: Use same PR state values as Github plugin
    
    * refactor: Simplify extractors and clean up models
    
    * refactor: Remove unused db.py
    
    * feat: Autoextraction via JSON-pointer
    
    * refactor: Add field source to models to remove extract methods
    
    ---------
    
    Co-authored-by: Camille Teruel <[email protected]>
---
 backend/python/README.md                           |  18 +++-
 .../plugins/azuredevops/azuredevops/helper/db.py   |  28 -----
 .../python/plugins/azuredevops/azuredevops/main.py |   6 +-
 .../plugins/azuredevops/azuredevops/models.py      | 117 +++------------------
 .../azuredevops/azuredevops/streams/builds.py      |  28 ++---
 .../azuredevops/azuredevops/streams/jobs.py        |  10 +-
 .../azuredevops/streams/pull_request_commits.py    |  16 +--
 .../azuredevops/streams/pull_requests.py           |  39 ++++---
 backend/python/plugins/azuredevops/poetry.lock     |  13 +++
 .../plugins/azuredevops/tests/streams_test.py      |  10 +-
 backend/python/pydevlake/poetry.lock               |  14 ++-
 backend/python/pydevlake/pydevlake/__init__.py     |  17 ++-
 backend/python/pydevlake/pydevlake/extractor.py    |  46 ++++++--
 backend/python/pydevlake/pydevlake/model.py        |  23 ++--
 backend/python/pydevlake/pydevlake/stream.py       |   3 +-
 backend/python/pydevlake/pyproject.toml            |   1 +
 backend/python/pydevlake/tests/extractor_test.py   |  87 +++++++++++++++
 backend/python/test/fakeplugin/poetry.lock         |  13 +++
 18 files changed, 264 insertions(+), 225 deletions(-)

diff --git a/backend/python/README.md b/backend/python/README.md
index d0658c94a..062f6053e 100644
--- a/backend/python/README.md
+++ b/backend/python/README.md
@@ -233,7 +233,7 @@ class Users(Stream):
     def collect(self, state, context) -> Iterable[Tuple[object, dict]]:
         pass
 
-    def extract(self, raw_data, context) -> ToolUser:
+    def extract(self, raw_data) -> ToolUser:
         pass
 
     def convert(self, user: ToolUser, context) -> Iterable[DomainUser]:
@@ -251,9 +251,19 @@ The `collect` method takes a `state` dictionary and a context object and yields
 The last state that the plugin yielded for a given connection will be reused during the next collection.
 The plugin can use this `state` to store information necessary to perform incremental collection of data.
 
-The `extract` method takes a raw data object and a context object and returns a tool model. This method has a default implementation that uses the `tool_model` class attribute to create a new instance of the tool model and set its attributes from the raw data (`self.tool_model(**raw_data)`).
-If the raw data collected from the datasource and is simple enough and well aligned with your tool model, you can omit this method.
-Otherwise, you can override it to deal with e.g. nested data structures.
+The `extract` method takes a raw data object and returns a tool model.
+This method has a default implementation that populates an instance of the `tool_model` class with the raw data.
+When you need to extract a nested value from JSON raw data, you can specify a JSON pointer (see RFC 6901) as the `source` argument to a `Field` declaration.
+
+```python
+class User(ToolModel, table=True):
+    id: str = Field(primary_key=True)
+    name: str
+    email: str
+    address: str = Field(source="/contactInfo/address")
+```
+
+Here the `address` field will be populated with the value of the `address` property of the `contactInfo` object in the raw JSON.
 
 The `convert` method takes a tool-specific user model and converts it into domain-level user models.
 Here the two models align quite well, so the conversion is trivial:
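
For illustration, here is a minimal sketch of how the default extraction resolves the pointer documented above. Only the `User` model from the README snippet and the new `autoextract` helper are assumed; the raw payload and its values are hypothetical.

```python
from pydevlake.extractor import autoextract

# Hypothetical raw record from a datasource; the nested "contactInfo"
# object matches the pointer declared on User.address.
raw_user = {
    "id": "u-42",
    "name": "Jane Doe",
    "email": "[email protected]",
    "contactInfo": {"address": "1 Main St", "phone": "555-0100"},
}

# The default Stream.extract delegates to autoextract, which resolves each
# field's `source` pointer against the raw dict.
user = autoextract(raw_user, User)
# user.address == "1 Main St"
```
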
diff --git a/backend/python/plugins/azuredevops/azuredevops/helper/db.py 
b/backend/python/plugins/azuredevops/azuredevops/helper/db.py
deleted file mode 100644
index 0c1f81be3..000000000
--- a/backend/python/plugins/azuredevops/azuredevops/helper/db.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-
-#     http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Type
-
-from sqlalchemy import sql
-from sqlmodel import Session
-
-from pydevlake import Context
-
-
-def get(ctx: Context, model_type: Type, *query) -> any:
-    with Session(ctx.engine) as session:
-        stmt = sql.select(model_type).filter(*query)
-        model = session.exec(stmt).scalar()
-        return model
diff --git a/backend/python/plugins/azuredevops/azuredevops/main.py 
b/backend/python/plugins/azuredevops/azuredevops/main.py
index 17a27a815..42b98f186 100644
--- a/backend/python/plugins/azuredevops/azuredevops/main.py
+++ b/backend/python/plugins/azuredevops/azuredevops/main.py
@@ -47,7 +47,7 @@ class AzureDevOpsPlugin(Plugin):
         yield Repo(
             name=git_repo.name,
             url=git_repo.url,
-            forked_from=git_repo.parentRepositoryUrl
+            forked_from=git_repo.parent_repository_url
         )
 
         yield CicdScope(
@@ -85,10 +85,10 @@ class AzureDevOpsPlugin(Plugin):
             url = url._replace(netloc=url.hostname)
             raw_repo['url'] = url.geturl()
             repo = GitRepository(**raw_repo)
-            if not repo.defaultBranch:
+            if not repo.default_branch:
                 continue
             if "parentRepository" in raw_repo:
-                repo.parentRepositoryUrl = raw_repo["parentRepository"]["url"]
+                repo.parent_repository_url = raw_repo["parentRepository"]["url"]
             yield repo
 
     def test_connection(self, connection: AzureDevOpsConnection):
diff --git a/backend/python/plugins/azuredevops/azuredevops/models.py 
b/backend/python/plugins/azuredevops/azuredevops/models.py
index c374754e7..b194f0cf5 100644
--- a/backend/python/plugins/azuredevops/azuredevops/models.py
+++ b/backend/python/plugins/azuredevops/azuredevops/models.py
@@ -18,9 +18,7 @@ from enum import Enum
 from typing import Optional
 import re
 
-from sqlmodel import Field
-
-from pydevlake import Connection, TransformationRule
+from pydevlake import Field, Connection, TransformationRule
 from pydevlake.model import ToolModel, ToolScope
 from pydevlake.pipeline_tasks import RefDiffOptions
 
@@ -36,109 +34,52 @@ class AzureDevOpsTransformationRule(TransformationRule):
     production_pattern: Optional[re.Pattern]
 
 
-class Project(ToolModel, table=True):
-    id: str = Field(primary_key=True)
-    name: str
-    url: str
-
-
 class GitRepository(ToolScope, table=True):
     url: str
-    sshUrl: str
     remoteUrl: str
-    defaultBranch: Optional[str]
-    project_id: str  # = Field(foreign_key=Project.id)
+    default_branch: Optional[str]
+    project_id: str
     org_id: str
-    size: int
-    isDisabled: bool
-    isInMaintenance: bool
-    isFork: Optional[bool]
-    parentRepositoryUrl: Optional[str]
+    parent_repository_url: Optional[str] = Field(source='parentRepository/url')
 
 
-# https://learn.microsoft.com/en-us/rest/api/azure/devops/git/pull-requests/get-pull-requests?view=azure-devops-rest-7.1&tabs=HTTP#identityrefwithvote
 class GitPullRequest(ToolModel, table=True):
     class Status(Enum):
         Abandoned = "abandoned"
         Active = "active"
-        All = "all"
         Completed = "completed"
-        NotSet = "notSet"
 
-    id: int = Field(primary_key=True)
-    project_id: Optional[str]
+    pull_request_id: int = Field(primary_key=True)
     description: Optional[str]
-    code_review_id: int = 0
-    repo_id: Optional[str]
     status: Status
-    created_by_id: Optional[str]
-    created_by_name: Optional[str]
+    created_by_id: str = Field(source='/createdBy/id')
+    created_by_name: str = Field(source='/createdBy/displayName')
     creation_date: datetime.datetime
     closed_date: Optional[datetime.datetime]
-    source_commit_sha: Optional[str]  # lastmergesourcecommit #base
-    target_commit_sha: Optional[str]  # lastmergetargetcommit #head
-    merge_commit_sha: Optional[str]
+    source_commit_sha: str = Field(source='/lastMergeSourceCommit/commitId')
+    target_commit_sha: str = Field(source='/lastMergeTargetCommit/commitId')
+    merge_commit_sha: str = Field(source='/lastMergeCommit/commitId')
     url: Optional[str]
-    type: Optional[str]
+    type: Optional[str] = Field(source='/labels/0/name') # TODO: get this off transformation rules regex
     title: Optional[str]
     target_ref_name: Optional[str]
     source_ref_name: Optional[str]
-    fork_repo_id: Optional[str]
+    fork_repo_id: Optional[str] = Field(source='/forkSource/repository/id')
 
 
 class GitPullRequestCommit(ToolModel, table=True):
-    commit_sha: str = Field(primary_key=True)
+    commit_id: str = Field(primary_key=True)
     pull_request_id: str
-    committer_name: str
-    committer_email: str
-    commit_date: datetime.datetime
-    author_name: str
-    author_email: str
-    authored_date: datetime.datetime
-    comment: str
-    url: str
-    additions: int
-    deletions: int
-
-
-class Account(ToolModel, table=True):
-    class Type(Enum):
-        Organization = "organization"
-        Personal = "personal"
-
-    class Status(Enum):
-        Deleted = "deleted"
-        Disabled = "disabled"
-        Enabled = "enabled"
-        Moved = "moved"
-        Non = "none"
-
-    account_id: str = Field(primary_key=True)
-    account_name: str
-    account_owner: str
-    account_type: Type
-    account_status: Status
-    organization_name: str
-    namespace_id: str
 
 
 class Build(ToolModel, table=True):
     class Status(Enum):
-        All = "all"
         Cancelling = "cancelling"
         Completed = "completed"
         InProgress = "inProgress"
-        Non = "none"
         NotStarted = "notStarted"
         Postponed = "postponed"
 
-    class Priority(Enum):
-        AboveNormal = "aboveNormal"
-        BelowNormal = "belowNormal"
-        High = "high"
-        Low = "low"
-        Normal = "normal"
-
     class Result(Enum):
         Canceled = "canceled"
         Failed = "failed"
@@ -147,33 +88,16 @@ class Build(ToolModel, table=True):
         Succeeded = "succeeded"
 
     id: int = Field(primary_key=True)
-    name: str
-    project_id: str
-    repo_id: str
-    repo_type: str
-    build_number: str
-    build_number_revision: Optional[str]
-    controller_id: Optional[str]
-    definition_id: Optional[str]
-    deleted: Optional[bool]
+    name: str = Field(source='/definition/name')
     start_time: Optional[datetime.datetime]
     finish_time: Optional[datetime.datetime]
     status: Status
-    tags: list[str] = []
-    priority: Priority
-    build_result: Result
+    result: Result
     source_branch: str
     source_version: str
 
 
 class Job(ToolModel, table=True):
-    class Type(Enum):
-        Task = "Task"
-        Job = "Job"
-        Checkpoint = "Checkpoint"
-        Stage = "Stage"
-        Phase = "Phase"
-
     class State(Enum):
         Completed = "completed"
         InProgress = "inProgress"
@@ -189,19 +113,8 @@ class Job(ToolModel, table=True):
 
     id: str = Field(primary_key=True)
     build_id: str
-    parentId: Optional[str]
-    type: Optional[Type]
     name: str
     startTime: datetime.datetime
     finishTime: datetime.datetime
-    lastModified: datetime.datetime
-    currentOperation: Optional[int]
-    percentComplete: Optional[int]
     state: State
     result: Result
-    resultCode: Optional[int]
-    changeId: Optional[int]
-    workerName: Optional[str]
-    order: Optional[int]
-    errorCount: Optional[int]
-    warningCount: Optional[int]
diff --git a/backend/python/plugins/azuredevops/azuredevops/streams/builds.py 
b/backend/python/plugins/azuredevops/azuredevops/streams/builds.py
index bc105cc62..006d3315a 100644
--- a/backend/python/plugins/azuredevops/azuredevops/streams/builds.py
+++ b/backend/python/plugins/azuredevops/azuredevops/streams/builds.py
@@ -20,7 +20,7 @@ import iso8601 as iso8601
 from azuredevops.api import AzureDevOpsAPI
 from azuredevops.models import GitRepository
 from azuredevops.models import Build
-from pydevlake import Context, DomainType, Stream, logger
+from pydevlake import Context, DomainType, Stream
 import pydevlake.domain_layer.devops as devops
 
 
@@ -35,35 +35,19 @@ class Builds(Stream):
         for raw_build in response:
             yield raw_build, state
 
-    def extract(self, raw_data: dict) -> Build:
-        build: Build = self.tool_model(**raw_data)
-        build.name = raw_data["definition"]["name"]
-        build.project_id = raw_data["project"]["id"]
-        build.repo_id = raw_data["repository"]["id"]
-        build.repo_type = raw_data["repository"]["type"]
-        build.build_number = raw_data["buildNumber"]
-        build.tags = ",".join(raw_data["tags"])
-        build.build_result = Build.Result(raw_data["result"])
-        trigger_info: dict = raw_data["triggerInfo"]
-        if "ci.sourceSha" in trigger_info: # this key is not guaranteed to be in here per docs
-            assert build.source_version == trigger_info["ci.sourceSha"]
-        return build
-
     def convert(self, b: Build, ctx: Context):
         result = None
-        if b.build_result == Build.Result.Canceled:
+        if b.result == Build.Result.Canceled:
             result = devops.CICDResult.ABORT
-        elif b.build_result == Build.Result.Failed:
+        elif b.result == Build.Result.Failed:
             result = devops.CICDResult.FAILURE
-        elif b.build_result == Build.Result.PartiallySucceeded:
+        elif b.result == Build.Result.PartiallySucceeded:
             result = devops.CICDResult.SUCCESS
-        elif b.build_result ==  Build.Result.Succeeded:
+        elif b.result ==  Build.Result.Succeeded:
             result = devops.CICDResult.SUCCESS
 
         status = None
-        if b.status == Build.Status.All:
-            status = devops.CICDStatus.IN_PROGRESS
-        elif b.status == Build.Status.Cancelling:
+        if b.status == Build.Status.Cancelling:
             status = devops.CICDStatus.DONE
         elif b.status == Build.Status.Completed:
             status = devops.CICDStatus.DONE
diff --git a/backend/python/plugins/azuredevops/azuredevops/streams/jobs.py 
b/backend/python/plugins/azuredevops/azuredevops/streams/jobs.py
index 89e6e4a25..131739551 100644
--- a/backend/python/plugins/azuredevops/azuredevops/streams/jobs.py
+++ b/backend/python/plugins/azuredevops/azuredevops/streams/jobs.py
@@ -31,13 +31,9 @@ class Jobs(Substream):
         repo: GitRepository = context.scope
         api = AzureDevOpsAPI(context.connection)
         response = api.jobs(repo.org_id, repo.project_id, parent.id)
-        if response.status != 200:
-            yield None, state
-        else:
-            for raw_job in response.json["records"]:
-                raw_job["build_id"] = parent.domain_id()
-                yield raw_job, state
-
+        for raw_job in response.json["records"]:
+            raw_job["build_id"] = parent.domain_id()
+            yield raw_job, state
 
     def convert(self, j: Job, ctx: Context) -> Iterable[devops.CICDPipeline]:
         result = None
diff --git a/backend/python/plugins/azuredevops/azuredevops/streams/pull_request_commits.py b/backend/python/plugins/azuredevops/azuredevops/streams/pull_request_commits.py
index 07fe9c78f..14007b36d 100644
--- a/backend/python/plugins/azuredevops/azuredevops/streams/pull_request_commits.py
+++ b/backend/python/plugins/azuredevops/azuredevops/streams/pull_request_commits.py
@@ -35,22 +35,8 @@ class GitPullRequestCommits(Substream):
             raw_commit["pull_request_id"] = parent.domain_id()
             yield raw_commit, state
 
-    def extract(self, raw_data: dict) -> GitPullRequestCommit:
-        return GitPullRequestCommit(
-            **raw_data,
-            commit_sha = raw_data["commitId"],
-            author_name = raw_data["author"]["name"],
-            author_email = raw_data["author"]["email"],
-            authored_date = raw_data["author"]["date"],
-            committer_name = raw_data["committer"]["name"],
-            committer_email = raw_data["committer"]["email"],
-            commit_date = raw_data["committer"]["date"],
-            additions = raw_data["changeCounts"]["Add"] if "changeCounts" in raw_data else 0,
-            deletions = raw_data["changeCounts"]["Delete"] if "changeCounts" in raw_data else 0
-        )
-
     def convert(self, commit: GitPullRequestCommit, context) -> Iterable[code.PullRequestCommit]:
         yield code.PullRequestCommit(
-            commit_sha=commit.commit_sha,
+            commit_sha=commit.commit_id,
             pull_request_id=commit.pull_request_id,
         )
diff --git a/backend/python/plugins/azuredevops/azuredevops/streams/pull_requests.py b/backend/python/plugins/azuredevops/azuredevops/streams/pull_requests.py
index 446bf031a..b7ab834be 100644
--- a/backend/python/plugins/azuredevops/azuredevops/streams/pull_requests.py
+++ b/backend/python/plugins/azuredevops/azuredevops/streams/pull_requests.py
@@ -17,7 +17,7 @@ from typing import Iterable
 
 from azuredevops.api import AzureDevOpsAPI
 from azuredevops.models import GitRepository, GitPullRequest
-from pydevlake import Stream, DomainType
+from pydevlake import Stream, DomainType, domain_id
 import pydevlake.domain_layer.code as code
 
 
@@ -32,33 +32,30 @@ class GitPullRequests(Stream):
         for raw_pr in response:
             yield raw_pr, state
 
-    def extract(self, raw_data: dict) -> GitPullRequest:
-        pr = GitPullRequest(**raw_data)
-        pr.id = raw_data["pullRequestId"]
-        pr.created_by_id = raw_data["createdBy"]["id"]
-        pr.created_by_name = raw_data["createdBy"]["displayName"]
-        pr.repo_id = raw_data["repository"]["id"]
-        pr.source_commit_sha = raw_data["lastMergeSourceCommit"]["commitId"]
-        pr.target_commit_sha = raw_data["lastMergeTargetCommit"]["commitId"]
-        pr.merge_commit_sha = raw_data["lastMergeCommit"]["commitId"]
-        if "labels" in raw_data:
-            # TODO get this off transformation rules regex
-            pr.type = raw_data["labels"][0]["name"]
-        if "forkSource" in raw_data:
-            pr.fork_repo_id = raw_data["forkSource"]["repository"]["id"]
-        return pr
-
     def convert(self, pr: GitPullRequest, ctx):
+        repo_id = ctx.scope.domain_id()
+        # If the PR is from a fork, we forge a new repo ID for the base repo but it doesn't correspond to a real repo
+        base_repo_id = domain_id(GitRepository, ctx.connection.id, pr.fork_repo_id) if pr.fork_repo_id is not None else repo_id
+
+        # Use the same status values as GitHub plugin
+        status = None
+        if pr.status == GitPullRequest.Status.Abandoned:
+            status = 'CLOSED'
+        elif pr.status == GitPullRequest.Status.Active:
+            status = 'OPEN'
+        elif pr.status == GitPullRequest.Status.Completed:
+            status = 'MERGED'
+
         yield code.PullRequest(
-            base_repo_id=(pr.fork_repo_id if pr.fork_repo_id is not None else pr.repo_id),
-            head_repo_id=pr.repo_id,
-            status=pr.status.value,
+            base_repo_id=base_repo_id,
+            head_repo_id=repo_id,
+            status=status,
             title=pr.title,
             description=pr.description,
             url=pr.url,
             author_name=pr.created_by_name,
             author_id=pr.created_by_id,
-            pull_request_key=pr.id,
+            pull_request_key=pr.pull_request_id,
             created_date=pr.creation_date,
             merged_date=pr.closed_date,
             closed_date=pr.closed_date,
diff --git a/backend/python/plugins/azuredevops/poetry.lock 
b/backend/python/plugins/azuredevops/poetry.lock
index 63c0e1b47..85afd5cb0 100644
--- a/backend/python/plugins/azuredevops/poetry.lock
+++ b/backend/python/plugins/azuredevops/poetry.lock
@@ -268,6 +268,18 @@ files = [
     {file = "iso8601-1.1.0.tar.gz", hash = 
"sha256:32811e7b81deee2063ea6d2e94f8819a86d1f3811e49d23623a41fa832bef03f"},
 ]
 
+[[package]]
+name = "jsonpointer"
+version = "2.3"
+description = "Identify specific nodes in a JSON document (RFC 6901)"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash = 
"sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"},
+    {file = "jsonpointer-2.3.tar.gz", hash = 
"sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"},
+]
+
 [[package]]
 name = "jsonref"
 version = "1.1.0"
@@ -425,6 +437,7 @@ develop = true
 [package.dependencies]
 fire = "^0.4.0"
 inflect = "^6.0.2"
+jsonpointer = "^2.3"
 jsonref = "^1.1.0"
 mysqlclient = "^2.1.1"
 psycopg2 = "^2.9.5"
diff --git a/backend/python/plugins/azuredevops/tests/streams_test.py 
b/backend/python/plugins/azuredevops/tests/streams_test.py
index 6725e45d5..9d95960e5 100644
--- a/backend/python/plugins/azuredevops/tests/streams_test.py
+++ b/backend/python/plugins/azuredevops/tests/streams_test.py
@@ -197,7 +197,7 @@ def test_jobs_stream(context):
     assert_stream_convert(AzureDevOpsPlugin, 'jobs', raw, expected, context)
 
 
-def test_pull_requests_stream():
+def test_pull_requests_stream(context):
     raw = {
         'repository': {
             'id': '0d50ba13-f9ad-49b0-9b21-d29eda50ca33',
@@ -273,9 +273,9 @@ def test_pull_requests_stream():
     }
 
     expected = code.PullRequest(
-        base_repo_id='0d50ba13-f9ad-49b0-9b21-d29eda50ca33',
-        head_repo_id='0d50ba13-f9ad-49b0-9b21-d29eda50ca33',
-        status='active',
+        base_repo_id='azuredevops:GitRepository:1:johndoe/test-repo',
+        head_repo_id='azuredevops:GitRepository:1:johndoe/test-repo',
+        status='OPEN',
         title='ticket-2 PR',
         description='Updated main.java by ticket-2',
         
url='https://dev.azure.com/johndoe/7a3fd40e-2aed-4fac-bac9-511bf1a70206/_apis/git/repositories/0d50ba13-f9ad-49b0-9b21-d29eda50ca33/pullRequests/1',
@@ -294,7 +294,7 @@ def test_pull_requests_stream():
         base_commit_sha='4bc26d92b5dbee7837a4d221035a4e2f8df120b2'
     )
 
-    assert_stream_convert(AzureDevOpsPlugin, 'gitpullrequests', raw, expected)
+    assert_stream_convert(AzureDevOpsPlugin, 'gitpullrequests', raw, expected, 
context)
 
 
 def test_pull_request_commits_stream():
diff --git a/backend/python/pydevlake/poetry.lock 
b/backend/python/pydevlake/poetry.lock
index 3c693468c..301036519 100644
--- a/backend/python/pydevlake/poetry.lock
+++ b/backend/python/pydevlake/poetry.lock
@@ -256,6 +256,18 @@ files = [
     {file = "iniconfig-2.0.0.tar.gz", hash = 
"sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
 ]
 
+[[package]]
+name = "jsonpointer"
+version = "2.3"
+description = "Identify specific nodes in a JSON document (RFC 6901)"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash = 
"sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"},
+    {file = "jsonpointer-2.3.tar.gz", hash = 
"sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"},
+]
+
 [[package]]
 name = "jsonref"
 version = "1.1.0"
@@ -623,4 +635,4 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "~3.9"
-content-hash = 
"75a142aa83def49843dbd04fbaea54779d3cc12398fc484bda5be0e7ebc28e48"
+content-hash = 
"cb0be45618de6cb1105d292164427e16a515938e44291039a8e0f0cce32fa26d"
diff --git a/backend/python/pydevlake/pydevlake/__init__.py 
b/backend/python/pydevlake/pydevlake/__init__.py
index 237c345d5..d645c0749 100644
--- a/backend/python/pydevlake/pydevlake/__init__.py
+++ b/backend/python/pydevlake/pydevlake/__init__.py
@@ -13,10 +13,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Any, Optional
+
 import pytest
 pytest.register_assert_rewrite('pydevlake.testing')
 
-from .model import ToolModel, ToolScope, DomainScope, Connection, TransformationRule
+from sqlmodel import Field as _Field
+
+
+def Field(*args, schema_extra: Optional[dict[str, Any]]=None, source: Optional[str]=None, **kwargs):
+    """
+    A wrapper around sqlmodel.Field that adds a source parameter.
+    """
+    schema_extra = schema_extra or {}
+    if source:
+        schema_extra['source'] = source
+    return _Field(*args, **kwargs, schema_extra=schema_extra)
+
+
+from .model import ToolModel, ToolScope, DomainScope, Connection, TransformationRule, domain_id
 from .logger import logger
 from .message import RemoteScopeGroup
 from .plugin import Plugin, ScopeTxRulePair
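
A brief sketch of how the `source` argument travels (the `Example` model here is hypothetical): the wrapper above stores it in the field's extra metadata, which is where the new `autoextract` helper in `extractor.py` (next diff) looks it up.

```python
from pydevlake import Field, ToolModel

class Example(ToolModel):
    version: str = Field(source='/version/number')

# The pointer ends up in the pydantic field metadata that autoextract reads:
Example.__fields__['version'].field_info.extra.get('source')  # '/version/number'
```
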
diff --git a/backend/python/pydevlake/pydevlake/extractor.py 
b/backend/python/pydevlake/pydevlake/extractor.py
index 8ee42914a..5943e1b86 100644
--- a/backend/python/pydevlake/pydevlake/extractor.py
+++ b/backend/python/pydevlake/pydevlake/extractor.py
@@ -15,15 +15,45 @@
 
 
 from typing import Type
+
+from jsonpointer import resolve_pointer, JsonPointerException
+
 from pydevlake import ToolModel
 
 
 def autoextract(json: dict, model_cls: Type[ToolModel]) -> ToolModel:
-    annotations = dict(model_cls.__annotations__)
-    for key, value in json.items():
-        if key in annotations:
-            expected_type = annotations[key]
-            if isinstance(expected_type, type) and issubclass(expected_type, ToolModel):
-                # TODO: replace with actual foreign key
-                json[key] = value["id"]                    
-    return model_cls(**json)
+    """
+    Automatically extract a tool model from a json object.
+    The tool model class can define fields with a source argument to specify the JSON pointer (RFC 6901) to the value.
+
+    Example:
+        class DummyModel(ToolModel):
+            name: str
+            version: str = Field(source='/version/number')
+
+        json = {
+            'name': 'test',
+            'version': {
+                'number': '1.0.0',
+                'build_date': '2023-04-19'
+            }
+        }
+
+        model = autoextract(json, DummyModel)
+    """
+    attributes = {}
+    for field in model_cls.__fields__.values():
+        pointer = field.field_info.extra.get('source')
+
+        if pointer:
+            if field.required:
+                try:
+                    value = resolve_pointer(json, pointer)
+                except JsonPointerException:
+                    raise ValueError(f"Missing required value for field {field.name} at {pointer}")
+            else:
+                value = resolve_pointer(json, pointer, field.default)
+        else:
+            value = json.get(field.name) or json.get(field.alias)
+        attributes[field.name] = value
+    return model_cls(**attributes)
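
JSON pointers can also index into arrays, which is how the Azure DevOps models pick the first label via `/labels/0/name`. A minimal sketch, with a hypothetical `TaggedModel` and payload:

```python
from typing import Optional

from pydevlake import Field, ToolModel
from pydevlake.extractor import autoextract

class TaggedModel(ToolModel):
    name: str
    first_label: Optional[str] = Field(source='/labels/0/name')

payload = {'name': 'test', 'labels': [{'name': 'feature'}, {'name': 'urgent'}]}
model = autoextract(payload, TaggedModel)
# model.first_label == 'feature'; if 'labels' were absent, this optional
# field would fall back to its default (None) instead of raising.
```
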
diff --git a/backend/python/pydevlake/pydevlake/model.py 
b/backend/python/pydevlake/pydevlake/model.py
index e45dd893e..51df93051 100644
--- a/backend/python/pydevlake/pydevlake/model.py
+++ b/backend/python/pydevlake/pydevlake/model.py
@@ -15,13 +15,13 @@
 
 
 import os
-from typing import Optional
+from typing import Iterable, Optional
 from inspect import getmodule
 from datetime import datetime
 
 import inflect
 from pydantic import AnyUrl, validator
-from sqlalchemy import Column, DateTime, func
+from sqlalchemy import Column, DateTime
 from sqlalchemy.orm import declared_attr
 from sqlalchemy.inspection import inspect
 from sqlmodel import SQLModel, Field
@@ -107,16 +107,16 @@ class ToolModel(ToolTable, NoPKModel):
         Generate an identifier for domain entities
         originating from self.
         """
+        return domain_id(type(self), self.connection_id, *self.primary_keys())
+
+    def primary_keys(self) -> Iterable[object]:
         model_type = type(self)
-        segments = [_get_plugin_name(model_type), model_type.__name__, str(self.connection_id)]
         mapper = inspect(model_type)
         for primary_key_column in mapper.primary_key:
             prop = mapper.get_property_by_column(primary_key_column)
             if prop.key == 'connection_id':
                 continue
-            attr_val = getattr(self, prop.key)
-            segments.append(str(attr_val))
-        return ':'.join(segments)
+            yield getattr(self, prop.key)
 
     class Config:
         allow_population_by_field_name = True
@@ -129,7 +129,6 @@ class ToolModel(ToolTable, NoPKModel):
             return parts[0] + ''.join(word.capitalize() for word in parts[1:])
 
 
-
 class DomainModel(NoPKModel):
     id: Optional[str] = Field(primary_key=True)
 
@@ -143,6 +142,16 @@ class DomainScope(DomainModel):
     pass
 
 
+def domain_id(model_type, connection_id, *args):
+    """
+    Generate an identifier for domain entities
+    originating from a model of type model_type.
+    """
+    segments = [_get_plugin_name(model_type), model_type.__name__, str(connection_id)]
+    segments.extend(str(arg) for arg in args)
+    return ':'.join(segments)
+
+
 def _get_plugin_name(cls):
     """
     Get the plugin name from a class by looking into
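
The extracted `domain_id` helper keeps the same segment layout that `ToolModel.domain_id` produced before: plugin name, model class name, connection id, then the primary-key values. A small sketch, assuming a connection id of 1 and the scope id used in the stream tests below:

```python
from pydevlake import domain_id
from azuredevops.models import GitRepository

domain_id(GitRepository, 1, 'johndoe/test-repo')
# -> 'azuredevops:GitRepository:1:johndoe/test-repo'
```
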
diff --git a/backend/python/pydevlake/pydevlake/stream.py 
b/backend/python/pydevlake/pydevlake/stream.py
index 54c8288de..d6df9f7f4 100644
--- a/backend/python/pydevlake/pydevlake/stream.py
+++ b/backend/python/pydevlake/pydevlake/stream.py
@@ -21,6 +21,7 @@ from enum import Enum
 from pydevlake.context import Context
 from pydevlake.subtasks import Collector, Extractor, Convertor, SubstreamCollector
 from pydevlake.model import RawModel, ToolModel, ToolScope, DomainModel
+from pydevlake.extractor import autoextract
 
 
 class DomainType(Enum):
@@ -86,7 +87,7 @@ class Stream:
         pass
 
     def extract(self, raw_data: dict) -> ToolModel:
-        return self.tool_model(**raw_data)
+        return autoextract(raw_data, self.tool_model)
 
     def convert(self, tool_model: ToolModel, context: Context) -> DomainModel:
         pass
diff --git a/backend/python/pydevlake/pyproject.toml 
b/backend/python/pydevlake/pyproject.toml
index 0bfcadbb4..6cd69b77a 100644
--- a/backend/python/pydevlake/pyproject.toml
+++ b/backend/python/pydevlake/pyproject.toml
@@ -33,6 +33,7 @@ pydevd-pycharm = "^231.6471.3"
 pytest = "^7.2.2"
 jsonref = "^1.1.0"
 psycopg2 = "^2.9.5"
+jsonpointer = "^2.3"
 
 
 [tool.poetry.group.dev.dependencies]
diff --git a/backend/python/pydevlake/tests/extractor_test.py 
b/backend/python/pydevlake/tests/extractor_test.py
new file mode 100644
index 000000000..48a1d91de
--- /dev/null
+++ b/backend/python/pydevlake/tests/extractor_test.py
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+from pydevlake import ToolModel, Field
+from pydevlake.extractor import autoextract
+import pytest
+
+
+def test_autoextract_from_json_pointer():
+    class DummyModel(ToolModel):
+        name: str
+        version: str = Field(source='/version/number')
+
+    json = {
+        'name': 'test',
+        'version': {
+            'number': '1.0.0',
+            'build_date': '2023-04-19'
+        }
+    }
+    model = autoextract(json, DummyModel)
+    assert model.name == 'test'
+    assert model.version == '1.0.0'
+
+
+def test_autoextract_optional_field_with_missing_value():
+    class DummyModel(ToolModel):
+        name: str
+        version: Optional[str] = Field(source='/version/number')
+
+    json = {
+        'name': 'test',
+        'version': {
+            # missing 'number'
+            'build_date': '2023-04-19'
+        }
+    }
+    model = autoextract(json, DummyModel)
+    assert model.name == 'test'
+    assert model.version == None
+
+
+def test_autoextract_optional_field_with_default_with_missing_value():
+    class DummyModel(ToolModel):
+        name: str
+        version: Optional[str] = Field(source='/version/number', default='1.0.0')
+
+    json = {
+        'name': 'test',
+        'version': {
+            # missing 'number'
+            'build_date': '2023-04-19'
+        }
+    }
+    model = autoextract(json, DummyModel)
+    assert model.name == 'test'
+    assert model.version == '1.0.0'
+
+
+
+def test_autoextract_required_field_and_missing_field():
+    class DummyModel(ToolModel):
+        name: str
+        version: str = Field(source='/version/number')
+
+    json = {
+        'name': 'test',
+        'version': {
+            # missing 'number'
+            'build_date': '2023-04-19'
+        }
+    }
+
+    pytest.raises(ValueError, autoextract, json, DummyModel)
diff --git a/backend/python/test/fakeplugin/poetry.lock 
b/backend/python/test/fakeplugin/poetry.lock
index a9067012c..dcceb7d68 100644
--- a/backend/python/test/fakeplugin/poetry.lock
+++ b/backend/python/test/fakeplugin/poetry.lock
@@ -256,6 +256,18 @@ files = [
     {file = "iniconfig-2.0.0.tar.gz", hash = 
"sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
 ]
 
+[[package]]
+name = "jsonpointer"
+version = "2.3"
+description = "Identify specific nodes in a JSON document (RFC 6901)"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash = 
"sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"},
+    {file = "jsonpointer-2.3.tar.gz", hash = 
"sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"},
+]
+
 [[package]]
 name = "jsonref"
 version = "1.1.0"
@@ -413,6 +425,7 @@ develop = true
 [package.dependencies]
 fire = "^0.4.0"
 inflect = "^6.0.2"
+jsonpointer = "^2.3"
 jsonref = "^1.1.0"
 mysqlclient = "^2.1.1"
 psycopg2 = "^2.9.5"

