From 37f7346266221822a32931ff0eff994e20088177 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Mon, 29 Sep 2025 17:11:53 -0700 Subject: [PATCH 1/9] fix: remove dead code Signed-off-by: Zack Koppert --- measure_innersource.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/measure_innersource.py b/measure_innersource.py index 6d0d933..693a48b 100644 --- a/measure_innersource.py +++ b/measure_innersource.py @@ -123,8 +123,6 @@ def main(): # pragma: no cover ghe, gh_app_id, gh_app_private_key_bytes, gh_app_installation_id ) - # evaluate_markdown_file_size(output_file) - if github_connection: logger.info("Connection to GitHub successful") From 204b3f882dce270173557ff39b16bf51b62c042b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Sep 2025 00:16:15 +0000 Subject: [PATCH 2/9] Initial plan From 56128798cd15111b9bdb0350b7adfcdb5718457c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Sep 2025 00:38:04 +0000 Subject: [PATCH 3/9] Add OWNING_TEAM environment variable for custom team ownership Co-authored-by: zkoppert <6935431+zkoppert@users.noreply.github.com> --- README.md | 47 ++++++++++++++ config.py | 19 +++++- markdown_writer.py | 13 +++- measure_innersource.py | 120 ++++++++++++++++++---------------- test_config_owning_team.py | 128 +++++++++++++++++++++++++++++++++++++ 5 files changed, 269 insertions(+), 58 deletions(-) create mode 100644 test_config_owning_team.py diff --git a/README.md b/README.md index 0dd80c0..0f43a43 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,17 @@ The tool determines team boundaries using this algorithm: - Are managers of anyone in the team 4. **Classify Contributors**: Any contributor not in the team list is considered an InnerSource contributor +**Overriding Team Determination**: You can override this algorithm by setting the `OWNING_TEAM` environment variable with a comma-separated list of GitHub usernames. When set, these users will be considered the owning team, and the algorithm above will be bypassed. This is useful when: +- The first commit author doesn't accurately represent the current owning team +- The org chart doesn't align with actual repository ownership +- You want to explicitly define team boundaries + +Example: +```yaml +env: + OWNING_TEAM: "alice,bob,charlie" +``` + #### Example Team Boundary Calculation
@@ -517,6 +528,40 @@ jobs: CHUNK_SIZE: "100" ``` +#### Using Custom Team Ownership + +To override the automatic team determination and explicitly specify the owning team: + +```yaml +name: InnerSource with Custom Team + +on: + schedule: + - cron: "0 0 * * 0" + workflow_dispatch: + +jobs: + measure-innersource: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Measure InnerSource + uses: github/measure-innersource@v1 + env: + REPOSITORY: "org/repository" + GH_TOKEN: ${{ secrets.GH_TOKEN }} + OWNING_TEAM: "alice,bob,charlie,david" + REPORT_TITLE: "InnerSource Report with Custom Team" +``` + +This is useful when: +- The first commit author doesn't represent the current team +- The org chart doesn't align with actual ownership +- You want to explicitly define team boundaries + + ### Configuration Below are the allowed configuration options: @@ -550,6 +595,7 @@ This action can be configured to authenticate with GitHub App Installation or Pe | `REPORT_TITLE` | False | `"InnerSource Report"` | Title to have on the report issue. | | `REPOSITORY` | True | `""` | The name of the repository you are trying to measure. Format `owner/repo` ie. `github/measure-innersource` | | `CHUNK_SIZE` | False | `100` | Number of items to process at once when fetching data. Increasing can improve performance but uses more memory. Minimum value is 10. | +| `OWNING_TEAM` | False | `""` | Comma-separated list of GitHub usernames that own the repository. Overrides the built-in team determination algorithm. Example: `alice,bob,charlie` | ## Understanding the Results @@ -709,6 +755,7 @@ The tool automatically splits large files, but you can: - [ ] `OUTPUT_FILE` (default: "innersource_report.md") - [ ] `CHUNK_SIZE` (default: 100, minimum: 10) - [ ] `RATE_LIMIT_BYPASS` (default: false) +- [ ] `OWNING_TEAM` (comma-separated usernames to override team determination) #### File Requirements Checklist diff --git a/config.py b/config.py index 4e89356..0c818d3 100644 --- a/config.py +++ b/config.py @@ -36,6 +36,7 @@ class EnvVars: output_file (str): The name of the file to write the report to rate_limit_bypass (bool): If set to TRUE, bypass the rate limit for the GitHub API chunk_size (int): The number of items to process at once when fetching data (for memory efficiency) + owning_team (list[str] | None): Optional list of usernames that comprise the owning team (overrides algorithm) """ def __init__( @@ -52,6 +53,7 @@ def __init__( output_file: str, rate_limit_bypass: bool = False, chunk_size: int = 100, + owning_team: list[str] | None = None, ): self.gh_app_id = gh_app_id self.gh_app_installation_id = gh_app_installation_id @@ -65,6 +67,7 @@ def __init__( self.output_file = output_file self.rate_limit_bypass = rate_limit_bypass self.chunk_size = chunk_size + self.owning_team = owning_team def __repr__(self): return ( @@ -80,7 +83,8 @@ def __repr__(self): f"{self.repo}," f"{self.output_file}," f"{self.rate_limit_bypass}," - f"{self.chunk_size}" + f"{self.chunk_size}," + f"{self.owning_team}" ")" ) @@ -183,6 +187,8 @@ def get_env_vars(test: bool = False) -> EnvVars: - OUTPUT_FILE: Output filename (default: "innersource_report.md") - RATE_LIMIT_BYPASS: Set to "true" to bypass rate limiting - CHUNK_SIZE: Number of items to process at once (default: 100, minimum: 10) + - OWNING_TEAM: Comma-separated list of GitHub usernames that own the repository + (overrides the built-in team determination algorithm) Examples: >>> os.environ['GH_TOKEN'] = 'ghp_...' @@ -243,6 +249,16 @@ def get_env_vars(test: bool = False) -> EnvVars: # Default to DEFAULT_CHUNK_SIZE if not a valid integer chunk_size = DEFAULT_CHUNK_SIZE + # Get optional owning team override (comma-separated list of usernames) + owning_team_str = os.getenv("OWNING_TEAM", "").strip() + owning_team = None + if owning_team_str: + # Parse comma-separated list and strip whitespace from each username + owning_team = [username.strip() for username in owning_team_str.split(",") if username.strip()] + # If the list is empty after stripping, set to None + if not owning_team: + owning_team = None + return EnvVars( gh_app_id, gh_app_installation_id, @@ -256,4 +272,5 @@ def get_env_vars(test: bool = False) -> EnvVars: output_file, rate_limit_bypass, chunk_size, + owning_team, ) diff --git a/markdown_writer.py b/markdown_writer.py index c3cfd88..d48a0ef 100644 --- a/markdown_writer.py +++ b/markdown_writer.py @@ -138,9 +138,16 @@ def write_to_markdown( return report_file.write(f"## Repository: {repo_data.full_name}\n\n") report_file.write(f"### InnerSource Ratio: {innersource_ratio:.2%}\n\n") - report_file.write( - f"### Original Commit Author: {original_commit_author} (Manager: {original_commit_author_manager})\n\n" - ) + if original_commit_author and original_commit_author_manager: + report_file.write( + f"### Original Commit Author: {original_commit_author} (Manager: {original_commit_author_manager})\n\n" + ) + elif original_commit_author: + report_file.write( + f"### Original Commit Author: {original_commit_author}\n\n" + ) + else: + report_file.write("### Team ownership is explicitly specified\n\n") report_file.write("## Team Members that Own the Repo:\n") if team_members_that_own_the_repo: for member in team_members_that_own_the_repo: diff --git a/measure_innersource.py b/measure_innersource.py index 693a48b..8cde7be 100644 --- a/measure_innersource.py +++ b/measure_innersource.py @@ -100,6 +100,7 @@ def main(): # pragma: no cover repo = env_vars.repo report_title = env_vars.report_title output_file = env_vars.output_file + owning_team = env_vars.owning_team # rate_limit_bypass = env_vars.rate_limit_bypass ghe = env_vars.ghe @@ -163,68 +164,79 @@ def main(): # pragma: no cover innersource_contributors = [] team_members_that_own_the_repo = [] - logger.info("Analyzing first commit...") + # Get all commits for contribution counting (needed regardless of team determination method) + logger.info("Fetching commits...") commits = repo_data.commits() - # Paginate to the last page to get the oldest commit - # commits is a GitHubIterator, so you can use .count to get total, - # then get the last one commit_list = list(commits) - first_commit = commit_list[-1] # The last in the list is the oldest - original_commit_author = first_commit.author.login - # Check if original commit author exists in org chart - if original_commit_author not in org_data: - logger.warning( - "Original commit author '%s' not found in org chart. " - "Cannot determine team boundaries for InnerSource " - "measurement.", + # Check if owning team is explicitly specified + if owning_team: + logger.info("Using explicitly specified owning team: %s", owning_team) + team_members_that_own_the_repo = owning_team + # Set variables to None since they're not used when team is specified + original_commit_author = None + original_commit_author_manager = None + else: + logger.info("Analyzing first commit...") + # Paginate to the last page to get the oldest commit + # commits is a GitHubIterator, so you can use .count to get total, + # then get the last one + first_commit = commit_list[-1] # The last in the list is the oldest + original_commit_author = first_commit.author.login + + # Check if original commit author exists in org chart + if original_commit_author not in org_data: + logger.warning( + "Original commit author '%s' not found in org chart. " + "Cannot determine team boundaries for InnerSource " + "measurement.", + original_commit_author, + ) + return + + original_commit_author_manager = org_data[original_commit_author]["manager"] + logger.info( + "Original commit author: %s, with manager: %s", original_commit_author, + original_commit_author_manager, ) - return + # Create a dictionary mapping users to their managers for faster lookups + user_to_manager = {} + manager_to_reports = {} + + for user, data in org_data.items(): + manager = data["manager"] + user_to_manager[user] = manager + + # Also create reverse mapping of manager to direct reports + if manager not in manager_to_reports: + manager_to_reports[manager] = [] + manager_to_reports[manager].append(user) + + # Find all users that report up to the same manager as the original commit author + team_members_that_own_the_repo.append(original_commit_author) + team_members_that_own_the_repo.append(original_commit_author_manager) + + # Add all users reporting to the same manager + if original_commit_author_manager in manager_to_reports: + team_members_that_own_the_repo.extend( + manager_to_reports[original_commit_author_manager] + ) - original_commit_author_manager = org_data[original_commit_author]["manager"] - logger.info( - "Original commit author: %s, with manager: %s", - original_commit_author, - original_commit_author_manager, - ) - # Create a dictionary mapping users to their managers for faster lookups - user_to_manager = {} - manager_to_reports = {} - - for user, data in org_data.items(): - manager = data["manager"] - user_to_manager[user] = manager - - # Also create reverse mapping of manager to direct reports - if manager not in manager_to_reports: - manager_to_reports[manager] = [] - manager_to_reports[manager].append(user) - - # Find all users that report up to the same manager as the original commit author - team_members_that_own_the_repo.append(original_commit_author) - team_members_that_own_the_repo.append(original_commit_author_manager) - - # Add all users reporting to the same manager - if original_commit_author_manager in manager_to_reports: - team_members_that_own_the_repo.extend( - manager_to_reports[original_commit_author_manager] + # Add everyone that has one of the team members listed as their manager + for user, manager in user_to_manager.items(): + if ( + manager in team_members_that_own_the_repo + and user not in team_members_that_own_the_repo + ): + team_members_that_own_the_repo.append(user) + + # Remove duplicates from the team members list + team_members_that_own_the_repo = list(set(team_members_that_own_the_repo)) + logger.debug( + "Team members that own the repo: %s", team_members_that_own_the_repo ) - # Add everyone that has one of the team members listed as their manager - for user, manager in user_to_manager.items(): - if ( - manager in team_members_that_own_the_repo - and user not in team_members_that_own_the_repo - ): - team_members_that_own_the_repo.append(user) - - # Remove duplicates from the team members list - team_members_that_own_the_repo = list(set(team_members_that_own_the_repo)) - logger.debug( - "Team members that own the repo: %s", team_members_that_own_the_repo - ) - # For each contributor, check if they are in the team that owns the repo list # and if not, add them to the innersource contributors list logger.info("Analyzing all contributors in the repository...") diff --git a/test_config_owning_team.py b/test_config_owning_team.py new file mode 100644 index 0000000..2717ece --- /dev/null +++ b/test_config_owning_team.py @@ -0,0 +1,128 @@ +"""Unit tests for OWNING_TEAM environment variable in config module.""" + +import os +import unittest +from unittest.mock import patch + +from config import get_env_vars + + +class TestOwningTeamEnvVar(unittest.TestCase): + """Test suite for the OWNING_TEAM environment variable.""" + + def setUp(self): + """Clean up environment before each test.""" + env_keys = [ + "GH_APP_ID", + "GH_APP_INSTALLATION_ID", + "GH_APP_PRIVATE_KEY", + "GH_TOKEN", + "GH_ENTERPRISE_URL", + "OUTPUT_FILE", + "REPORT_TITLE", + "RATE_LIMIT_BYPASS", + "REPOSITORY", + "OWNING_TEAM", + ] + for key in env_keys: + if key in os.environ: + del os.environ[key] + + @patch.dict( + os.environ, + { + "GH_TOKEN": "test_token", + "REPOSITORY": "owner/repo", + "OWNING_TEAM": "alice,bob,charlie", + }, + ) + def test_owning_team_parsed_correctly(self): + """Test that OWNING_TEAM is parsed correctly as a list.""" + env_vars = get_env_vars(test=True) + self.assertIsNotNone(env_vars.owning_team) + self.assertEqual(env_vars.owning_team, ["alice", "bob", "charlie"]) + + @patch.dict( + os.environ, + { + "GH_TOKEN": "test_token", + "REPOSITORY": "owner/repo", + "OWNING_TEAM": "alice, bob, charlie", + }, + ) + def test_owning_team_with_spaces(self): + """Test that OWNING_TEAM handles spaces correctly.""" + env_vars = get_env_vars(test=True) + self.assertIsNotNone(env_vars.owning_team) + self.assertEqual(env_vars.owning_team, ["alice", "bob", "charlie"]) + + @patch.dict( + os.environ, + { + "GH_TOKEN": "test_token", + "REPOSITORY": "owner/repo", + "OWNING_TEAM": "single-user", + }, + ) + def test_owning_team_single_user(self): + """Test that OWNING_TEAM works with a single user.""" + env_vars = get_env_vars(test=True) + self.assertIsNotNone(env_vars.owning_team) + self.assertEqual(env_vars.owning_team, ["single-user"]) + + @patch.dict( + os.environ, + { + "GH_TOKEN": "test_token", + "REPOSITORY": "owner/repo", + "OWNING_TEAM": "", + }, + ) + def test_owning_team_empty_string(self): + """Test that empty OWNING_TEAM is treated as None.""" + env_vars = get_env_vars(test=True) + self.assertIsNone(env_vars.owning_team) + + @patch.dict( + os.environ, + { + "GH_TOKEN": "test_token", + "REPOSITORY": "owner/repo", + }, + ) + def test_owning_team_not_set(self): + """Test that missing OWNING_TEAM is None.""" + env_vars = get_env_vars(test=True) + self.assertIsNone(env_vars.owning_team) + + @patch.dict( + os.environ, + { + "GH_TOKEN": "test_token", + "REPOSITORY": "owner/repo", + "OWNING_TEAM": " , , ", + }, + ) + def test_owning_team_only_spaces_and_commas(self): + """Test that OWNING_TEAM with only spaces and commas is treated as None.""" + env_vars = get_env_vars(test=True) + self.assertIsNone(env_vars.owning_team) + + @patch.dict( + os.environ, + { + "GH_TOKEN": "test_token", + "REPOSITORY": "owner/repo", + "OWNING_TEAM": "alice,,bob,,,charlie", + }, + ) + def test_owning_team_with_empty_entries(self): + """Test that OWNING_TEAM handles empty entries correctly.""" + env_vars = get_env_vars(test=True) + self.assertIsNotNone(env_vars.owning_team) + # Empty entries should be filtered out + self.assertEqual(env_vars.owning_team, ["alice", "bob", "charlie"]) + + +if __name__ == "__main__": + unittest.main() From a033b83980f40c8ab265f05625565f0dcb08e7dc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Sep 2025 00:39:31 +0000 Subject: [PATCH 4/9] Add comprehensive tests for OWNING_TEAM feature and update .env-example Co-authored-by: zkoppert <6935431+zkoppert@users.noreply.github.com> --- .env-example | 1 + test_markdown_writer_owning_team.py | 87 +++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 test_markdown_writer_owning_team.py diff --git a/.env-example b/.env-example index c05a496..84e3169 100644 --- a/.env-example +++ b/.env-example @@ -6,3 +6,4 @@ GH_ENTERPRISE_URL = "" GH_TOKEN = "" OUTPUT_FILE = "" REPORT_TITLE = "" +OWNING_TEAM = "" diff --git a/test_markdown_writer_owning_team.py b/test_markdown_writer_owning_team.py new file mode 100644 index 0000000..16c3bf0 --- /dev/null +++ b/test_markdown_writer_owning_team.py @@ -0,0 +1,87 @@ +"""Unit tests for markdown_writer with owning_team override.""" + +from unittest.mock import MagicMock + +from markdown_writer import write_to_markdown + + +def test_write_to_markdown_with_owning_team_override(tmp_path): + """Test markdown output when owning team is explicitly specified.""" + output_file = tmp_path / "test_owning_team.md" + + # Mock repository object + mock_repo = MagicMock() + mock_repo.full_name = "org/repo" + + # Call with None for original_commit_author and manager (simulating owning_team override) + write_to_markdown( + report_title="Test Report with Owning Team", + output_file=str(output_file), + innersource_ratio=0.35, + repo_data=mock_repo, + original_commit_author=None, + original_commit_author_manager=None, + team_members_that_own_the_repo=["alice", "bob", "charlie"], + all_contributors=["alice", "bob", "charlie", "dave", "eve"], + innersource_contributors=["dave", "eve"], + innersource_contribution_counts={"dave": 15, "eve": 8}, + team_member_contribution_counts={"alice": 25, "bob": 12, "charlie": 5}, + ) + + # Verify the file was created + assert output_file.exists() + + # Read the content + content = output_file.read_text(encoding="utf-8") + + # Verify the content includes the expected sections + assert "# Test Report with Owning Team" in content + assert "## Repository: org/repo" in content + assert "### InnerSource Ratio: 35.00%" in content + assert "### Team ownership is explicitly specified" in content + # Should NOT contain original commit author and manager + assert "Original Commit Author:" not in content + assert "Manager:" not in content + # Should contain team members + assert "## Team Members that Own the Repo:" in content + assert "- alice" in content + assert "- bob" in content + assert "- charlie" in content + + +def test_write_to_markdown_with_original_author(tmp_path): + """Test markdown output when team is determined by algorithm.""" + output_file = tmp_path / "test_algorithm.md" + + # Mock repository object + mock_repo = MagicMock() + mock_repo.full_name = "org/repo" + + # Call with original_commit_author and manager (normal algorithm mode) + write_to_markdown( + report_title="Test Report with Algorithm", + output_file=str(output_file), + innersource_ratio=0.35, + repo_data=mock_repo, + original_commit_author="alice", + original_commit_author_manager="manager1", + team_members_that_own_the_repo=["alice", "bob", "charlie"], + all_contributors=["alice", "bob", "charlie", "dave", "eve"], + innersource_contributors=["dave", "eve"], + innersource_contribution_counts={"dave": 15, "eve": 8}, + team_member_contribution_counts={"alice": 25, "bob": 12, "charlie": 5}, + ) + + # Verify the file was created + assert output_file.exists() + + # Read the content + content = output_file.read_text(encoding="utf-8") + + # Verify the content includes the expected sections + assert "# Test Report with Algorithm" in content + assert "## Repository: org/repo" in content + assert "### InnerSource Ratio: 35.00%" in content + assert "### Original Commit Author: alice (Manager: manager1)" in content + # Should NOT contain the override message + assert "Team ownership is explicitly specified" not in content From 2b7be5d10fce550dd797878c9bb736b5d45967ba Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Tue, 30 Sep 2025 11:26:20 -0700 Subject: [PATCH 5/9] fix: linting fix for readability Signed-off-by: Zack Koppert --- config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/config.py b/config.py index 0c818d3..e0b0a8e 100644 --- a/config.py +++ b/config.py @@ -254,7 +254,11 @@ def get_env_vars(test: bool = False) -> EnvVars: owning_team = None if owning_team_str: # Parse comma-separated list and strip whitespace from each username - owning_team = [username.strip() for username in owning_team_str.split(",") if username.strip()] + owning_team = [ + username.strip() + for username in owning_team_str.split(",") + if username.strip() + ] # If the list is empty after stripping, set to None if not owning_team: owning_team = None From 3a45a2dfaa6a2f4ffe738adf2e8022f164d7cfce Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Tue, 30 Sep 2025 14:42:30 -0700 Subject: [PATCH 6/9] ix: resolve infinite loop in test_main_missing_user_in_org_chart This commit fixes an issue in the test where the mock commits setup was causing an infinite loop. The changes include: - Converting mock commits to a proper iterator to match real GitHub API behavior - Fixing environment variable field names to align with actual main function usage - Adding chunk_size to avoid issues in processing loops - Adding additional assertions to verify early function exit Tests that were previously hanging now complete successfully. Signed-off-by: Zack Koppert --- test_measure_innersource.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/test_measure_innersource.py b/test_measure_innersource.py index adc22e1..92c9854 100644 --- a/test_measure_innersource.py +++ b/test_measure_innersource.py @@ -99,23 +99,26 @@ def test_main_missing_user_in_org_chart(tmp_path, monkeypatch): mock_repo.full_name = "test/repo" # Mock commits to return our mock commit as the first/oldest commit - mock_repo.commits.return_value = [mock_commit] + # Create a proper iterator that will convert to a list with one item + mock_repo.commits.return_value = iter([mock_commit]) mock_github = MagicMock() mock_github.repository.return_value = mock_repo # Mock environment variables mock_env_vars = MagicMock() - mock_env_vars.github_token = "fake_token" - mock_env_vars.github_enterprise_hostname = None - mock_env_vars.github_org = "test" - mock_env_vars.github_repo = "repo" + mock_env_vars.gh_token = "fake_token" # Change to match field name in main() + mock_env_vars.ghe = None # Change to match field name in main() + mock_env_vars.owner = "test" # Change to match field name in main() + mock_env_vars.repo = "repo" # Change to match field name in main() mock_env_vars.gh_app_id = None mock_env_vars.gh_app_installation_id = None mock_env_vars.gh_app_private_key_bytes = None mock_env_vars.gh_app_enterprise_only = False mock_env_vars.report_title = "Test Report" mock_env_vars.output_file = "test_output.md" + mock_env_vars.owning_team = None # Add owning_team field that's used in main() + mock_env_vars.chunk_size = 100 # Add chunk_size to avoid issues in processing loops # Apply mocks with patch("measure_innersource.get_env_vars", return_value=mock_env_vars), patch( @@ -159,6 +162,13 @@ def test_main_missing_user_in_org_chart(tmp_path, monkeypatch): for msg in info_calls ) + # Verify that the function exited without attempting to process pull requests or issues + # by checking that "Processing pull requests" message is not in the logs + assert not any( + isinstance(msg, str) and "Processing pull requests" in msg + for msg in info_calls + ) + def test_contributors_missing_from_org_chart_excluded(tmp_path, monkeypatch): """Test that contributors missing from org chart are excluded from @@ -189,7 +199,7 @@ def test_contributors_missing_from_org_chart_excluded(tmp_path, monkeypatch): mock_repo = MagicMock() mock_repo.full_name = "test/repo" - mock_repo.commits.return_value = [mock_commit] + mock_repo.commits.return_value = iter([mock_commit]) mock_repo.contributors.return_value = [mock_contributor1] # Mock empty pull requests and issues to avoid infinite loops mock_repo.pull_requests.return_value = iter([]) @@ -200,16 +210,17 @@ def test_contributors_missing_from_org_chart_excluded(tmp_path, monkeypatch): # Mock environment variables mock_env_vars = MagicMock() - mock_env_vars.github_token = "fake_token" - mock_env_vars.github_enterprise_hostname = None - mock_env_vars.github_org = "test" - mock_env_vars.github_repo = "repo" + mock_env_vars.gh_token = "fake_token" # Change to match field name in main() + mock_env_vars.ghe = None # Change to match field name in main() + mock_env_vars.owner = "test" # Change to match field name in main() + mock_env_vars.repo = "repo" # Change to match field name in main() mock_env_vars.gh_app_id = None mock_env_vars.gh_app_installation_id = None mock_env_vars.gh_app_private_key_bytes = None mock_env_vars.gh_app_enterprise_only = False mock_env_vars.report_title = "Test Report" mock_env_vars.output_file = "test_output.md" + mock_env_vars.owning_team = None # Add owning_team field that's used in main() mock_env_vars.chunk_size = 100 # Apply mocks From ec1c43488780e9136973bc48eb78bdc7c1ea5920 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Tue, 30 Sep 2025 14:47:58 -0700 Subject: [PATCH 7/9] test: add coverage for original commit author without manager case Added test_markdown_writer_original_author_only.py to cover the specific case where an original commit author is provided without a manager, ensuring that the appropriate heading is written to the report markdown file. Signed-off-by: Zack Koppert --- test_markdown_writer_original_author_only.py | 59 ++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 test_markdown_writer_original_author_only.py diff --git a/test_markdown_writer_original_author_only.py b/test_markdown_writer_original_author_only.py new file mode 100644 index 0000000..8782993 --- /dev/null +++ b/test_markdown_writer_original_author_only.py @@ -0,0 +1,59 @@ +""" +Tests for markdown_writer.py specifically for when only original_commit_author is provided +""" + +import os +import tempfile + +from markdown_writer import write_to_markdown + + +def test_write_to_markdown_original_author_only(): + """ + Test writing markdown when only original_commit_author is provided without manager + """ + with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as temp_file: + temp_file_path = temp_file.name + + try: + # Mock repo data object + class MockRepo: + """ + Mock repository data class for testing + """ + + @property + def full_name(self): + """ + Returns the full name of the mock repository + """ + return "owner/repo" + + write_to_markdown( + report_title="Test Report", + output_file=temp_file_path, + innersource_ratio=0.5, + repo_data=MockRepo(), + original_commit_author="author", # Only provide the author, not the manager + original_commit_author_manager="", # Empty manager + team_members_that_own_the_repo=["team_member1"], + all_contributors=["contributor1"], + innersource_contributors=["contributor1"], + innersource_contribution_counts={"contributor1": 5}, + team_member_contribution_counts={"team_member1": 10}, + ) + + # Read the generated file + with open(temp_file_path, "r", encoding="utf-8") as f: + content = f.read() + + # Check that the original author is included without manager + assert "### Original Commit Author: author\n" in content + + # Make sure the manager version is not included + assert "Manager:" not in content + + finally: + # Clean up the temporary file + if os.path.exists(temp_file_path): + os.remove(temp_file_path) From 2d429f4d619fbd57a968314b7c82506bc0e67609 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Tue, 30 Sep 2025 14:54:39 -0700 Subject: [PATCH 8/9] refactor: add explicit team_ownership flag for better edge case handling This change improves the clarity and maintainability of the codebase by: - Adding an explicit `team_ownership_explicitly_specified` parameter to track when team ownership is provided explicitly rather than derived from commit history - Updating logic in markdown_writer.py to prioritize this flag when determining report headings - Adding a fallback message for cases where neither original author nor explicit team ownership is specified - Setting flag value in measure_innersource.py based on owning_team parameter - Adding comprehensive tests for the new parameter and updated behavior This change makes the relationship between owning_team and report content more explicit and handles edge cases more gracefully. Signed-off-by: Zack Koppert --- markdown_writer.py | 30 ++++- measure_innersource.py | 3 + test_markdown_writer.py | 4 +- test_markdown_writer_edge_cases.py | 2 + test_markdown_writer_more_edge_cases.py | 1 + test_markdown_writer_original_author_only.py | 1 + test_markdown_writer_owning_team.py | 4 +- test_markdown_writer_team_ownership_flag.py | 112 +++++++++++++++++++ test_markdown_writer_zero_contributions.py | 1 + 9 files changed, 153 insertions(+), 5 deletions(-) create mode 100644 test_markdown_writer_team_ownership_flag.py diff --git a/markdown_writer.py b/markdown_writer.py index d48a0ef..6143526 100644 --- a/markdown_writer.py +++ b/markdown_writer.py @@ -40,6 +40,7 @@ def write_to_markdown( innersource_contributors=None, innersource_contribution_counts=None, team_member_contribution_counts=None, + team_ownership_explicitly_specified=False, ) -> None: """ Generate a comprehensive InnerSource collaboration report in markdown format. @@ -84,6 +85,10 @@ def write_to_markdown( mapping team member usernames to their contribution counts. + team_ownership_explicitly_specified (bool, optional): Flag indicating whether team + ownership is explicitly specified + rather than derived from commit history. + Defaults to False. Returns: None: This function creates a markdown file as a side effect. @@ -126,7 +131,22 @@ def write_to_markdown( ... all_contributors=["alice", "bob", "charlie", "dave", "eve"], ... innersource_contributors=["dave", "eve"], ... innersource_contribution_counts={"dave": 15, "eve": 8}, - ... team_member_contribution_counts={"alice": 25, "bob": 12, "charlie": 5} + ... team_member_contribution_counts={"alice": 25, "bob": 12, "charlie": 5}, + ... team_ownership_explicitly_specified=False + ... ) + + >>> # Generate a report with explicitly specified team ownership + >>> write_to_markdown( + ... report_title="Team Ownership Report", + ... output_file="team_report.md", + ... innersource_ratio=0.38, + ... repo_data=repo_object, + ... team_members_that_own_the_repo=["david", "emily", "frank"], + ... all_contributors=["david", "emily", "frank", "greg", "hannah"], + ... innersource_contributors=["greg", "hannah"], + ... innersource_contribution_counts={"greg": 12, "hannah": 6}, + ... team_member_contribution_counts={"david": 18, "emily": 9, "frank": 7}, + ... team_ownership_explicitly_specified=True ... ) """ output_file_name = output_file if output_file else "innersource_report.md" @@ -138,7 +158,9 @@ def write_to_markdown( return report_file.write(f"## Repository: {repo_data.full_name}\n\n") report_file.write(f"### InnerSource Ratio: {innersource_ratio:.2%}\n\n") - if original_commit_author and original_commit_author_manager: + if team_ownership_explicitly_specified: + report_file.write("### Team ownership is explicitly specified\n\n") + elif original_commit_author and original_commit_author_manager: report_file.write( f"### Original Commit Author: {original_commit_author} (Manager: {original_commit_author_manager})\n\n" ) @@ -147,7 +169,9 @@ def write_to_markdown( f"### Original Commit Author: {original_commit_author}\n\n" ) else: - report_file.write("### Team ownership is explicitly specified\n\n") + report_file.write( + "### Original commit author information not available\n\n" + ) report_file.write("## Team Members that Own the Repo:\n") if team_members_that_own_the_repo: for member in team_members_that_own_the_repo: diff --git a/measure_innersource.py b/measure_innersource.py index 8cde7be..0ff2951 100644 --- a/measure_innersource.py +++ b/measure_innersource.py @@ -414,6 +414,9 @@ def main(): # pragma: no cover innersource_contributors=innersource_contributors, innersource_contribution_counts=innersource_contribution_counts, team_member_contribution_counts=team_member_contribution_counts, + team_ownership_explicitly_specified=bool( + owning_team + ), # True if owning_team is specified ) evaluate_markdown_file_size(output_file) diff --git a/test_markdown_writer.py b/test_markdown_writer.py index 8bc6acc..8ff20a4 100644 --- a/test_markdown_writer.py +++ b/test_markdown_writer.py @@ -29,6 +29,7 @@ def test_write_to_markdown_no_repo_data(self): write_to_markdown( report_title="InnerSource Report", output_file="innersource_report.md", + team_ownership_explicitly_specified=False, ) # Check that the function writes the correct markdown file @@ -63,6 +64,7 @@ def test_write_to_markdown_with_data(self): innersource_contributors=innersource_contributors, innersource_contribution_counts=innersource_counts, team_member_contribution_counts=team_member_counts, + team_ownership_explicitly_specified=False, ) # Check that the function writes the correct markdown file @@ -96,7 +98,7 @@ def test_write_to_markdown_default_filename(self): """Test that write_to_markdown uses the default filename when none is provided.""" # Call the function with no output_file write_to_markdown( - report_title="InnerSource Report", + report_title="InnerSource Report", team_ownership_explicitly_specified=False ) # Check that the function uses the default filename diff --git a/test_markdown_writer_edge_cases.py b/test_markdown_writer_edge_cases.py index 76905ba..e12476a 100644 --- a/test_markdown_writer_edge_cases.py +++ b/test_markdown_writer_edge_cases.py @@ -46,6 +46,7 @@ def full_name(self): innersource_contributors=["contributor1"], innersource_contribution_counts={"contributor1": 5}, team_member_contribution_counts={"team_member1": 10}, + team_ownership_explicitly_specified=False, ) # Read the generated file @@ -93,6 +94,7 @@ def full_name(self): innersource_contributors=None, # Empty innersource contributors innersource_contribution_counts={"contributor1": 5}, team_member_contribution_counts={"team_member1": 10}, + team_ownership_explicitly_specified=False, ) # Read the generated file diff --git a/test_markdown_writer_more_edge_cases.py b/test_markdown_writer_more_edge_cases.py index e2b68b5..c2d930b 100644 --- a/test_markdown_writer_more_edge_cases.py +++ b/test_markdown_writer_more_edge_cases.py @@ -46,6 +46,7 @@ def full_name(self): innersource_contributors=["contributor1"], innersource_contribution_counts=None, # Empty contribution counts team_member_contribution_counts={}, # Empty dict for team member counts + team_ownership_explicitly_specified=False, ) # Read the generated file diff --git a/test_markdown_writer_original_author_only.py b/test_markdown_writer_original_author_only.py index 8782993..e49602e 100644 --- a/test_markdown_writer_original_author_only.py +++ b/test_markdown_writer_original_author_only.py @@ -41,6 +41,7 @@ def full_name(self): innersource_contributors=["contributor1"], innersource_contribution_counts={"contributor1": 5}, team_member_contribution_counts={"team_member1": 10}, + team_ownership_explicitly_specified=False, ) # Read the generated file diff --git a/test_markdown_writer_owning_team.py b/test_markdown_writer_owning_team.py index 16c3bf0..5dfd6b1 100644 --- a/test_markdown_writer_owning_team.py +++ b/test_markdown_writer_owning_team.py @@ -13,7 +13,7 @@ def test_write_to_markdown_with_owning_team_override(tmp_path): mock_repo = MagicMock() mock_repo.full_name = "org/repo" - # Call with None for original_commit_author and manager (simulating owning_team override) + # Call with team_ownership_explicitly_specified=True write_to_markdown( report_title="Test Report with Owning Team", output_file=str(output_file), @@ -26,6 +26,7 @@ def test_write_to_markdown_with_owning_team_override(tmp_path): innersource_contributors=["dave", "eve"], innersource_contribution_counts={"dave": 15, "eve": 8}, team_member_contribution_counts={"alice": 25, "bob": 12, "charlie": 5}, + team_ownership_explicitly_specified=True, ) # Verify the file was created @@ -70,6 +71,7 @@ def test_write_to_markdown_with_original_author(tmp_path): innersource_contributors=["dave", "eve"], innersource_contribution_counts={"dave": 15, "eve": 8}, team_member_contribution_counts={"alice": 25, "bob": 12, "charlie": 5}, + team_ownership_explicitly_specified=False, ) # Verify the file was created diff --git a/test_markdown_writer_team_ownership_flag.py b/test_markdown_writer_team_ownership_flag.py new file mode 100644 index 0000000..ac12a25 --- /dev/null +++ b/test_markdown_writer_team_ownership_flag.py @@ -0,0 +1,112 @@ +""" +Tests for markdown_writer.py specifically for the team_ownership_explicitly_specified parameter +""" + +import os +import tempfile + +from markdown_writer import write_to_markdown + + +def test_write_to_markdown_missing_original_author_with_flag(): + """ + Test writing markdown with team_ownership_explicitly_specified=True + """ + with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as temp_file: + temp_file_path = temp_file.name + + try: + # Mock repo data object + class MockRepo: + """ + Mock repository data class for testing + """ + + @property + def full_name(self): + """ + Returns the full name of the mock repository + """ + return "owner/repo" + + # Test with missing original author but with explicit team ownership flag + write_to_markdown( + report_title="Test Report", + output_file=temp_file_path, + innersource_ratio=0.5, + repo_data=MockRepo(), + original_commit_author=None, # No original author + original_commit_author_manager=None, # No manager + team_members_that_own_the_repo=["team_member1"], + all_contributors=["contributor1"], + innersource_contributors=["contributor1"], + innersource_contribution_counts={"contributor1": 5}, + team_member_contribution_counts={"team_member1": 10}, + team_ownership_explicitly_specified=True, # Explicit flag + ) + + # Read the generated file + with open(temp_file_path, "r", encoding="utf-8") as f: + content = f.read() + + # Check that explicit team ownership message is included + assert "### Team ownership is explicitly specified" in content + # Make sure the "Original commit author information not available" is not included + assert "Original commit author information not available" not in content + + finally: + # Clean up the temporary file + if os.path.exists(temp_file_path): + os.remove(temp_file_path) + + +def test_write_to_markdown_missing_original_author_without_flag(): + """ + Test writing markdown with missing original_commit_author but without team_ownership_explicitly_specified + """ + with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as temp_file: + temp_file_path = temp_file.name + + try: + # Mock repo data object + class MockRepo: + """ + Mock repository data class for testing + """ + + @property + def full_name(self): + """ + Returns the full name of the mock repository + """ + return "owner/repo" + + # Test with missing original author and without explicit team ownership flag + write_to_markdown( + report_title="Test Report", + output_file=temp_file_path, + innersource_ratio=0.5, + repo_data=MockRepo(), + original_commit_author=None, # No original author + original_commit_author_manager=None, # No manager + team_members_that_own_the_repo=["team_member1"], + all_contributors=["contributor1"], + innersource_contributors=["contributor1"], + innersource_contribution_counts={"contributor1": 5}, + team_member_contribution_counts={"team_member1": 10}, + team_ownership_explicitly_specified=False, # Flag is False + ) + + # Read the generated file + with open(temp_file_path, "r", encoding="utf-8") as f: + content = f.read() + + # Check that "Original commit author information not available" is included + assert "### Original commit author information not available" in content + # Make sure the team ownership explicitly specified message is not included + assert "Team ownership is explicitly specified" not in content + + finally: + # Clean up the temporary file + if os.path.exists(temp_file_path): + os.remove(temp_file_path) diff --git a/test_markdown_writer_zero_contributions.py b/test_markdown_writer_zero_contributions.py index d348eb4..6bdf503 100644 --- a/test_markdown_writer_zero_contributions.py +++ b/test_markdown_writer_zero_contributions.py @@ -49,6 +49,7 @@ def full_name(self): "team_member1": 0, "team_member2": 0, }, # All zero counts + team_ownership_explicitly_specified=False, ) # Read the generated file From 0bd5cc4340b98c6a4f9d162e4e0cedd5e942f6b4 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Tue, 30 Sep 2025 15:28:31 -0700 Subject: [PATCH 9/9] perf: refactor commit processing to use chunking for better memory efficiency This change improves memory usage and scalability for repositories with large commit histories by: - Replacing list(commits) with chunked processing to avoid loading the entire commit history into memory at once - Using iterators directly to find the original commit author instead of loading all commits first - Processing commits in manageable chunks based on the configured chunk_size, similar to how pull requests and issues are already handled - Adding debug logging for commit processing progress This approach ensures consistent memory usage regardless of repository size and prevents potential out-of-memory issues when analyzing repositories with extensive commit histories. Signed-off-by: Zack Koppert --- measure_innersource.py | 68 +++++++++++++++++++++++++++---------- test_measure_innersource.py | 11 +++--- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/measure_innersource.py b/measure_innersource.py index 0ff2951..15ea113 100644 --- a/measure_innersource.py +++ b/measure_innersource.py @@ -164,11 +164,6 @@ def main(): # pragma: no cover innersource_contributors = [] team_members_that_own_the_repo = [] - # Get all commits for contribution counting (needed regardless of team determination method) - logger.info("Fetching commits...") - commits = repo_data.commits() - commit_list = list(commits) - # Check if owning team is explicitly specified if owning_team: logger.info("Using explicitly specified owning team: %s", owning_team) @@ -177,12 +172,26 @@ def main(): # pragma: no cover original_commit_author = None original_commit_author_manager = None else: - logger.info("Analyzing first commit...") - # Paginate to the last page to get the oldest commit - # commits is a GitHubIterator, so you can use .count to get total, - # then get the last one - first_commit = commit_list[-1] # The last in the list is the oldest - original_commit_author = first_commit.author.login + logger.info("Finding original commit author...") + # We need to find the oldest commit for team determination + # Use GitHub's default chronological ordering (oldest first) + commits_iterator = repo_data.commits() + original_commit = None + + # Process just enough commits to find the oldest one + # Most repos will only need a single API call since GitHub sorts oldest first + # For repositories with unusual commit ordering, we'll get the first commit from the first page + try: + original_commit = next(commits_iterator) + original_commit_author = ( + original_commit.author.login + if hasattr(original_commit.author, "login") + else None + ) + logger.info("Found original commit by %s", original_commit_author) + except StopIteration: + logger.warning("No commits found in repository") + original_commit_author = None # Check if original commit author exists in org chart if original_commit_author not in org_data: @@ -267,13 +276,38 @@ def main(): # pragma: no cover logger.info("Pre-processing contribution data...") - # Create mapping of commit authors to commit counts - logger.info("Processing commits...") + # Process commits in chunks + logger.info("Processing commits in chunks...") commit_author_counts = {} - for commit in commit_list: - if hasattr(commit.author, "login"): - author = commit.author.login - commit_author_counts[author] = commit_author_counts.get(author, 0) + 1 + total_commits = 0 + + # GitHub API returns an iterator that internally handles pagination + # We'll manually chunk it to avoid loading everything at once + commits_iterator = repo_data.commits() + while True: + # Process a chunk of commits + chunk = [] + for _ in range(chunk_size): + try: + chunk.append(next(commits_iterator)) + except StopIteration: + break + + if not chunk: + break + + # Update counts for this chunk + for commit in chunk: + if hasattr(commit.author, "login"): + author = commit.author.login + commit_author_counts[author] = ( + commit_author_counts.get(author, 0) + 1 + ) + + total_commits += len(chunk) + logger.debug(" Processed %s commits so far...", total_commits) + + logger.info("Found and processed %s commits", total_commits) # Process pull requests in chunks logger.info("Processing pull requests in chunks...") diff --git a/test_measure_innersource.py b/test_measure_innersource.py index 92c9854..5f4f60c 100644 --- a/test_measure_innersource.py +++ b/test_measure_innersource.py @@ -147,13 +147,12 @@ def test_main_missing_user_in_org_chart(tmp_path, monkeypatch): call[0][0] for call in mock_logger.info.call_args_list if call[0] ] - # Should have logged about reading org data and analyzing first - # commit, but should NOT have logged about original commit author - # with manager + # Should have logged about reading org data and finding original commit author + # but should NOT have logged about original commit author with manager assert "Reading in org data from org-data.json..." in info_calls - assert "Analyzing first commit..." in info_calls - - # Should NOT contain the log message about + assert ( + "Finding original commit author..." in info_calls + ) # Should NOT contain the log message about # "Original commit author: X, with manager: Y" assert not any( isinstance(msg, str)