Skip to content

Commit

Permalink
[#23846] Support for automatically syncing inline third-party dependencies
Browse files Browse the repository at this point in the history

Summary:
Update thirdparty_tool to allow automatically syncing up the inline (currently header-only)
third-party dependencies stored in src/inline-thirdparty. We only copy separate header
subdirectories of those upstream repositories into our codebase. This light-weight process avoids
going through the regular third-party dependency build and release loop.

The inline third-party dependencies are described by the new configuration file
build-support/inline_thirdparty.yml.

Also refactoring thirdparty_tool into separate modules.

Test Plan:
bin/run_codecheck

bin/thirdparty_tool --sync-inline-thirdparty

Regression testing of thirdparty_tool:
bin/thirdparty_tool --update

Jenkins: compile only

Reviewers: steve.varnau

Reviewed By: steve.varnau

Subscribers: ybase

Differential Revision: https://phorge.dev.yugabyte.com/D37916
  • Loading branch information
mbautin committed Sep 13, 2024
1 parent 72c91c4 commit e9ec9e2
Show file tree
Hide file tree
Showing 10 changed files with 1,308 additions and 838 deletions.
39 changes: 39 additions & 0 deletions build-support/inline_thirdparty.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) YugabyteDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations
# under the License.
#

# See the README.md file in src/inline-thirdparty for more information.

dependencies:
- name: usearch
git_url: https://github.com/unum-cloud/usearch
commit: 4fbb56e02aa928a011abdedb66adfef128123e5f
src_dir: include
dest_dir: usearch

- name: fp16
git_url: https://github.com/Maratyszcza/FP16/
commit: 98b0a46bce017382a6351a19577ec43a715b6835
src_dir: include
dest_dir: fp16

- name: hnswlib
git_url: https://github.com/nmslib/hnswlib
commit: 2142dc6f4dd08e64ab727a7bbd93be7f732e80b0
src_dir: hnswlib
dest_dir: hnswlib/hnswlib

- name: simsimd
git_url: https://github.com/ashvardanian/simsimd
src_dir: include
dest_dir: simsimd
tag: v5.1.0
4 changes: 4 additions & 0 deletions python/yugabyte/file_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ def read_file(file_path: Union[str, pathlib.Path]) -> str:

def write_file(
content: Union[str, List[str]], output_file_path: Union[str, pathlib.Path]) -> None:
if '\n' in str(output_file_path):
raise ValueError(
"Output file path cannot contain newlines. It is possible that file content and path "
f"were reversed accidentally. Content: {content}, output_file_path: {output_file_path}")
if isinstance(content, list):
content = '\n'.join(content) + '\n'
with open(path_to_str(output_file_path), 'w') as output_file:
Expand Down
67 changes: 67 additions & 0 deletions python/yugabyte/git_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,74 @@
# or implied. See the License for the specific language governing permissions and limitations
# under the License.

import logging
import os
import re
import subprocess

from typing import Optional

from yugabyte.file_util import read_file


# Matches a full-length Git SHA-1 object name: exactly 40 lowercase hexadecimal digits.
SHA1_RE = re.compile(r'^[0-9a-f]{40}$')


def is_valid_git_sha(commit: str) -> bool:
    """Return True if `commit` is exactly 40 lowercase hexadecimal characters.

    No normalization is performed: uppercase digits, surrounding whitespace and abbreviated
    SHAs are all rejected.
    """
    # fullmatch() rather than match(): `$` also matches just before a trailing newline, so
    # match() would accept a SHA followed by "\n".
    return SHA1_RE.fullmatch(commit) is not None


def validate_git_commit(commit: str) -> str:
    """Normalize a Git commit SHA1 and check that it is well-formed.

    Surrounding whitespace is stripped and the value is lowercased before it is checked with
    is_valid_git_sha().

    Returns:
        The normalized SHA1.

    Raises:
        ValueError: if the normalized value is not a valid SHA1.
    """
    normalized = commit.strip().lower()
    if is_valid_git_sha(normalized):
        return normalized
    raise ValueError(f"Invalid Git commit SHA1: {normalized}")


def get_github_token(token_file_path: Optional[str]) -> Optional[str]:
    """Obtain a GitHub token from a file or from the GITHUB_TOKEN environment variable.

    Args:
        token_file_path: path of a file containing the token. When empty or None, the
            GITHUB_TOKEN environment variable is consulted instead.

    Returns:
        The token, or None when no file was given and GITHUB_TOKEN is not set.

    Raises:
        ValueError: if a token was found but is not exactly 40 characters long.
    """
    token: Optional[str]
    if not token_file_path:
        token = os.getenv('GITHUB_TOKEN')
    else:
        logging.info("Reading GitHub token from %s", token_file_path)
        token = read_file(token_file_path).strip()

    if token is None:
        return None

    token_length = len(token)
    if token_length == 40:
        return token
    raise ValueError(f"Invalid GitHub token length: {token_length}, expected 40.")


def is_git_clean(repo_dir: str) -> bool:
    """Report whether `git status --porcelain`, run in `repo_dir`, prints nothing.

    Any non-whitespace output is treated as "not clean".

    Raises:
        subprocess.CalledProcessError: if the git command exits with a non-zero status.
    """
    status = subprocess.run(
        ['git', 'status', '--porcelain'],
        cwd=repo_dir,
        check=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    # An empty listing means there is nothing to report for the working directory.
    return not status.stdout.strip()


def get_latest_commit_in_subdir(repo_dir: str, subdir: str) -> str:
    """
    Get the latest commit that affected a particular subdirectory.

    Runs `git log -n 1 --pretty=format:%H -- <subdir>` with `repo_dir` as the working directory.

    Args:
        repo_dir: directory in which to run git.
        subdir: path of the subdirectory, which must be relative.

    Returns:
        The 40-character SHA1 of the commit, as normalized by validate_git_commit().

    Raises:
        ValueError: if `subdir` is absolute, if git reports no commit for it, or if the
            reported value is not a valid SHA1.
        subprocess.CalledProcessError: if the git command exits with a non-zero status.
    """
    # Raise explicitly instead of using `assert`, which is stripped when Python runs with -O.
    if os.path.isabs(subdir):
        raise ValueError(
            f"Subdirectory must be a relative path, not an absolute path: {subdir}")
    result = subprocess.run(
        ['git', 'log', '-n', '1', '--pretty=format:%H', '--', subdir],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        cwd=repo_dir,
        check=True
    )
    commit_sha = result.stdout.strip()
    if not commit_sha:
        raise ValueError(f"No commits found for subdirectory: {subdir}")
    return validate_git_commit(commit_sha)
246 changes: 246 additions & 0 deletions python/yugabyte/inline_thirdparty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
# Copyright (c) YugabyteDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations
# under the License.
#

# Manages header-only third-party dependencies in src/inline-thirdparty. These dependencies are
# copied from the relevant subdirectories of upstream repositories and typically represent only a
# small portion of the upstream repository.

import logging
import os
import shutil
import subprocess
import tempfile

from pathlib import Path
from typing import Optional, List, Set
from dataclasses import dataclass

import ruamel.yaml

from yugabyte import common_util, file_util, git_util


# Path of the inline third-party dependency configuration file, relative to the source root.
INLINE_THIRDPARTY_CONFIG_REL_PATH = 'build-support/inline_thirdparty.yml'
# The same configuration path joined onto common_util.YB_SRC_ROOT.
INLINE_THIRDPARTY_CONFIG_PATH = os.path.join(
    common_util.YB_SRC_ROOT, INLINE_THIRDPARTY_CONFIG_REL_PATH)
# Directory under the source root that receives the copied dependency subtrees.
INLINE_THIRDPARTY_SRC_DIR = os.path.join(common_util.YB_SRC_ROOT, 'src', 'inline-thirdparty')

# Only files whose names end with one of these suffixes get a "//" comment header prepended.
FILE_EXTENSIONS_SUPPORTING_CPP_COMMENTS = ('.c', '.cc', '.cpp', '.h', '.hpp', '.modulemap')


# Module-level YAML parser instance used to load the configuration file.
ruamel_yaml_object = ruamel.yaml.YAML()


@dataclass
class InlineDependency:
    """One entry of the `dependencies` list in the inline third-party configuration file.

    Exactly one of `tag` and `commit` is expected to be set. This is not enforced at
    construction time; call validate_tag_or_commit_choice() to check it.
    """
    # Name of the dependency.
    name: str
    # URL of the upstream Git repository.
    git_url: str
    # Directory inside the upstream repository to copy.
    src_dir: str
    # Destination directory for the copy.
    dest_dir: str
    # Upstream tag to use (mutually exclusive with `commit`).
    tag: Optional[str] = None
    # Upstream commit to use (mutually exclusive with `tag`).
    commit: Optional[str] = None

    def validate_tag_or_commit_choice(self) -> None:
        """Raise ValueError unless exactly one of `tag` and `commit` is set (non-empty)."""
        if self.tag and self.commit:
            raise ValueError(f"Only one of tag or commit can be specified: {self}")
        if not self.tag and not self.commit:
            raise ValueError(f"One of tag or commit must be specified: {self}")

    @property
    def tag_or_commit_description(self) -> str:
        """Human-readable version description: "tag <tag>" or "commit <commit>".

        Raises:
            ValueError: if the tag/commit choice is invalid.
        """
        self.validate_tag_or_commit_choice()
        if self.tag:
            return f"tag {self.tag}"
        if self.commit:
            return f"commit {self.commit}"
        # Unreachable after the validation above; kept so every path returns or raises.
        raise ValueError(f"Should not happen: {self}")

    @property
    def tag_or_commit(self) -> str:
        """The configured tag if set, otherwise the configured commit.

        Raises:
            ValueError: if the tag/commit choice is invalid.
        """
        self.validate_tag_or_commit_choice()
        result = self.tag or self.commit
        assert result is not None
        return result

    def get_github_commits_url(self, resolved_commit: str) -> str:
        """Return the "<git_url>/commits/<resolved_commit>" URL for this dependency."""
        # git_url may be configured with a trailing slash; strip it so that the result does not
        # contain "//commits/".
        return self.git_url.rstrip('/') + '/commits/' + resolved_commit


@dataclass
class DependenciesConfig:
    """In-memory form of the inline third-party dependency configuration."""
    # One InlineDependency per entry of the configuration's `dependencies` list.
    dependencies: List[InlineDependency]


def read_yaml(file_path: str) -> DependenciesConfig:
    """Load the YAML file at `file_path` and convert it to a DependenciesConfig.

    Each item of the top-level `dependencies` list is passed as keyword arguments to
    InlineDependency.
    """
    with open(file_path) as yaml_file:
        parsed = ruamel_yaml_object.load(yaml_file)

    deps: List[InlineDependency] = []
    for entry in parsed['dependencies']:
        deps.append(InlineDependency(**entry))
    return DependenciesConfig(dependencies=deps)


def get_latest_commit_explanation(
        dep: InlineDependency,
        latest_commit_in_subdir: str,
        cpp_comment: bool = False) -> str:
    """Build a two-line note naming the latest commit in the dependency's source subdirectory.

    The first line is a description mentioning dep.src_dir and dep.name; the second line is the
    commit itself. With cpp_comment=True the second line is prefixed with "// "; the first line
    is never prefixed.
    """
    second_line_prefix = "// " if cpp_comment else ""
    lines = [
        f"Latest commit in the {dep.src_dir} subdirectory of the {dep.name} repository:",
        second_line_prefix + latest_commit_in_subdir,
    ]
    return "\n".join(lines)


def add_comment_to_file(
        file_path: str,
        dep: InlineDependency,
        latest_commit_in_subdir: str) -> None:
    """Prepend a "//" comment header describing which version of the dependency is in use.

    The header names the dependency, its Git repository, the configured tag or commit, and the
    latest commit in the dependency's source subdirectory. The file is rewritten in place.

    Files are skipped (with an info-level log message) when their name does not end with one of
    FILE_EXTENSIONS_SUPPORTING_CPP_COMMENTS or when they are symlinks.
    """
    if not file_path.endswith(FILE_EXTENSIONS_SUPPORTING_CPP_COMMENTS):
        logging.info("Cannot add comment to file %s", file_path)
        return
    if os.path.islink(file_path):
        logging.info("Cannot add comment to symlink %s", file_path)
        return

    original_text = file_util.read_file(file_path)

    if dep.tag:
        version_line = f"// Git tag: {dep.tag}"
    else:
        version_line = f"// Git commit: {dep.commit}"
    header_lines = [
        f"// This file is part of the {dep.name} inline third-party dependency of YugabyteDB.",
        f"// Git repo: {dep.git_url}",
        version_line,
        f"// {get_latest_commit_explanation(dep, latest_commit_in_subdir, cpp_comment=True)}",
        "//",
        "// See also src/inline-thirdparty/README.md.",
    ]
    header = "\n".join(header_lines)
    # A blank line separates the header from the original file content.
    file_util.write_file(header + '\n\n' + original_text, file_path)


def validate_dir(dep: InlineDependency, dir_type: str) -> None:
    """Validate one directory attribute of `dep`, and its git_url prefix.

    Args:
        dep: the dependency being validated.
        dir_type: name of the attribute to check, e.g. "src_dir" or "dest_dir".

    Raises:
        ValueError: if the attribute is empty, if it is an absolute path, or if dep.git_url does
            not start with "https://github.com/".
    """
    configured_dir = getattr(dep, dir_type)
    if not configured_dir:
        raise ValueError(f"{dir_type} is required for {dep.name}")

    if os.path.isabs(configured_dir):
        raise ValueError(f"{dir_type} must be a relative path for {dep.name}")

    is_github_url = dep.git_url.startswith('https://github.com/')
    if not is_github_url:
        raise ValueError(f"git_url must be a GitHub URL for {dep.name}")


def validate_config(config: DependenciesConfig) -> None:
    """Validates the config.

    For each dependency, in order, checks that:
      - its name has not been used by an earlier dependency;
      - git_url is non-empty;
      - src_dir and dest_dir pass validate_dir();
      - dest_dir is the dependency name itself or starts with the name followed by '/';
      - exactly one of tag and commit is set.

    Raises:
        ValueError: on the first check that fails.
    """
    names_seen: Set[str] = set()
    for dep in config.dependencies:
        if dep.name in names_seen:
            raise ValueError(f"Duplicate name {dep.name}")
        names_seen.add(dep.name)

        if not dep.git_url:
            raise ValueError(f"git_url is required for {dep.name}")

        validate_dir(dep, "src_dir")
        validate_dir(dep, "dest_dir")
        # The sync step deletes the top-level directory named by the first component of
        # dest_dir, so that component must be the dependency's own name.
        if dep.dest_dir != dep.name and not dep.dest_dir.startswith(dep.name + '/'):
            raise ValueError(
                f"dest_dir must be the same as dependency name or have the dependency name as "
                f"its first relative path component for {dep.name}: {dep}")
        dep.validate_tag_or_commit_choice()


def clone_and_copy_subtrees(dependencies: List[InlineDependency]) -> None:
    """Clones repositories into a temporary directory and copies the subtrees.

    For each dependency, in order:
      1. Clone dep.git_url into a temporary directory and check out the configured tag or
         commit.
      2. Resolve the checked-out commit and, when a commit (not a tag) is configured, verify
         that the two are identical.
      3. Delete the dependency's top-level directory under INLINE_THIRDPARTY_SRC_DIR and copy
         dep.src_dir from the clone to dep.dest_dir.
      4. Add a version comment to every copied file via add_comment_to_file().
      5. Call make_commit() to commit the result.

    Raises:
        ValueError: if the checked-out commit does not match the configured commit.
        subprocess.CalledProcessError: if a git command fails.
    """
    src_root = Path(INLINE_THIRDPARTY_SRC_DIR)

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        for dep in dependencies:
            repo_dir = temp_path / dep.name
            logging.info(f"Cloning {dep.name} into {repo_dir}")

            # Clone the repository into the temp directory
            subprocess.check_call(['git', 'clone', dep.git_url, str(repo_dir)])

            # Checkout the specified tag or commit
            subprocess.check_call(['git', 'checkout', dep.tag_or_commit], cwd=repo_dir)

            # Ask git for HEAD directly. A path-limited `git log -- .` applies history
            # simplification and skips a HEAD commit whose tree is identical to its parent's
            # (e.g. an empty commit), which would make the check below fail spuriously.
            resolved_commit = git_util.validate_git_commit(
                subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=repo_dir, text=True))
            if dep.commit and resolved_commit != dep.commit:
                raise ValueError(
                    f"Expected resolved commit {resolved_commit} to match configured commit "
                    f"{dep.commit} for dependency {dep.name}")

            # Define source and destination directories
            src_subtree = repo_dir / dep.src_dir
            dest_subtree = src_root / dep.dest_dir

            logging.info("Copying subtree from %s to %s", src_subtree, dest_subtree)

            # Remove the current content in the destination directory. We remove the entire
            # top-level directory under inline-thirdparty, even though dest_dir could contain
            # multiple path components.
            subtree_to_remove = src_root / Path(dep.dest_dir).parts[0]
            if subtree_to_remove.exists():
                logging.info(f"Deleting existing directory {subtree_to_remove}")
                shutil.rmtree(subtree_to_remove)

            # Ensure the parent of the destination exists. This is done after the removal
            # above, because the removed directory may itself be that parent.
            dest_subtree.parent.mkdir(parents=True, exist_ok=True)

            # Copy the subtree to the destination directory
            shutil.copytree(src_subtree, dest_subtree)

            latest_commit_in_subdir = git_util.get_latest_commit_in_subdir(
                str(repo_dir), dep.src_dir)
            for root, _, files in os.walk(dest_subtree):
                for file_name in files:
                    add_comment_to_file(
                        os.path.join(root, file_name), dep, latest_commit_in_subdir)

            # Commit the changes in the current repository
            make_commit(dep, latest_commit_in_subdir, resolved_commit)


def make_commit(
        dep: InlineDependency, latest_commit_in_subdir: str, resolved_commit: str) -> None:
    """Commit the updated dependency to the main YugabyteDB repo with a descriptive message.

    Both commit arguments are validated as SHA1s first. If the repository at
    common_util.YB_SRC_ROOT has no changes, nothing is committed. Otherwise everything under
    INLINE_THIRDPARTY_SRC_DIR is staged and committed. The message names the dependency, the
    upstream commit used, and (when it differs) the latest commit in the source subdirectory.
    """
    for sha in (latest_commit_in_subdir, resolved_commit):
        git_util.validate_git_commit(sha)

    if git_util.is_git_clean(common_util.YB_SRC_ROOT):
        logging.info(f"No changes were made to the {dep.name} dependency, nothing to commit.")
        return

    commits_url = dep.get_github_commits_url(resolved_commit)
    paragraphs = [
        "Automatic commit by thirdparty_tool: " +
        f"update {dep.name} to {dep.tag_or_commit_description}.",
        f"Used commit of the {dep.name} repository: {commits_url}",
    ]
    if latest_commit_in_subdir != resolved_commit:
        paragraphs.append(get_latest_commit_explanation(dep, latest_commit_in_subdir))

    # Paragraphs are separated by a blank line.
    commit_message = "\n\n".join(paragraphs)
    for git_args in (['add', '.'], ['commit', '-m', commit_message]):
        subprocess.check_call(['git'] + git_args, cwd=INLINE_THIRDPARTY_SRC_DIR)
    logging.info(f"Created an automatic commit for {dep.name}")


def sync_inline_thirdparty() -> None:
    """Read and validate the inline third-party config, then sync every dependency in it.

    Raises:
        RuntimeError: if the repository at common_util.YB_SRC_ROOT has local changes.
    """
    config = read_yaml(INLINE_THIRDPARTY_CONFIG_PATH)
    validate_config(config)

    repo_is_clean = git_util.is_git_clean(common_util.YB_SRC_ROOT)
    if repo_is_clean:
        clone_and_copy_subtrees(config.dependencies)
        return
    raise RuntimeError("Local changes exist, cannot update inline third-party dependencies.")
15 changes: 14 additions & 1 deletion python/yugabyte/string_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# under the License.

import hashlib
from typing import Union
from typing import Union, Optional, Any


def encode_if_needed(s: Union[bytes, str]) -> bytes:
Expand All @@ -22,3 +22,16 @@ def encode_if_needed(s: Union[bytes, str]) -> bytes:

def compute_sha256(s: Union[bytes, str]) -> str:
    """Return the hexadecimal SHA-256 digest of `s`, converted via encode_if_needed()."""
    hasher = hashlib.sha256()
    hasher.update(encode_if_needed(s))
    return hasher.hexdigest()


def none_to_empty_string(x: Optional[Any]) -> Any:
    """Return an empty string when `x` is None; otherwise return `x` unchanged."""
    return '' if x is None else x


def matches_maybe_empty(a: Optional[str], b: Optional[str]) -> bool:
    """
    Returns True if a and b are equal, treating None values as empty strings.
    """
    left = a if a else ''
    right = b if b else ''
    return left == right
Loading

0 comments on commit e9ec9e2

Please sign in to comment.