Skip to content

Commit dc13186

Browse files
authored
analytics: send remote types and git-remote's path hashes (#10816)
1 parent ba1281a commit dc13186

File tree

4 files changed

+135
-25
lines changed

4 files changed

+135
-25
lines changed

dvc/analytics.py

Lines changed: 71 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
import json
22
import os
3+
from typing import TYPE_CHECKING, Optional
34

45
from dvc.log import logger
56

67
from .env import DVC_ANALYTICS_ENDPOINT, DVC_NO_ANALYTICS
78

9+
if TYPE_CHECKING:
10+
from dvc.scm import Base
11+
812
logger = logger.getChild(__name__)
913

1014

@@ -89,20 +93,62 @@ def send(path):
8993
os.remove(path)
9094

9195

92-
def _scm_in_use():
93-
from dvc.exceptions import NotDvcRepoError
94-
from dvc.repo import Repo
95-
from dvc.scm import NoSCM
96+
def _git_remote_url(scm: Optional["Base"]) -> Optional[str]:
97+
from dvc.scm import Git
98+
99+
if not isinstance(scm, Git):
100+
return None
96101

97-
from .scm import SCM, SCMError
102+
from dulwich.porcelain import get_remote_repo
98103

104+
dulwich_repo = scm.dulwich.repo
99105
try:
100-
scm = SCM(root_dir=Repo.find_root())
101-
return type(scm).__name__
102-
except SCMError:
103-
return NoSCM.__name__
104-
except NotDvcRepoError:
105-
pass
106+
_remote, url = get_remote_repo(dulwich_repo)
107+
except IndexError:
108+
# IndexError happens when the head is detached
109+
_remote, url = get_remote_repo(dulwich_repo, b"origin")
110+
# Dulwich returns (None, "origin") if no remote set
111+
if (_remote, url) == (None, "origin"):
112+
return None
113+
return url
114+
115+
116+
def _scm_in_use(scm: Optional["Base"]) -> Optional[str]:
117+
return type(scm).__name__ if scm else None
118+
119+
120+
def _parse_git_remote_path(remote_url: str) -> str:
    """Extract the path component of a git remote URL.

    For ``http``, ``https``, ``git`` and ``ssh`` URLs the parsed path is
    returned with leading and trailing slashes removed. For scp-style URLs
    the text after the first ``:`` is returned with trailing slashes
    removed. Anything else is returned unchanged.
    """
    from urllib.parse import urlparse

    from scmrepo.urls import is_scp_style_url

    # Only trust ``urlparse`` for an explicit allow-list of schemes: a
    # Windows path is also parsed with its drive letter as the scheme.
    # https://github.com/python/cpython/issues/86381
    url_parts = urlparse(remote_url)
    if url_parts.scheme in ("http", "https", "git", "ssh"):
        return url_parts.path.strip("/")

    if not is_scp_style_url(remote_url):
        return remote_url

    # handle scp-style URL: everything after the first colon is the path
    _host, colon, repo_path = remote_url.partition(":")
    if not colon:
        return remote_url
    return repo_path.rstrip("/")
138+
139+
140+
def _git_remote_path_hash(scm: Optional["Base"]) -> Optional[str]:
    """Return a hash of the git remote path.

    Returns the hex MD5 digest of the parsed remote path, or ``None`` when
    no remote URL is found. Any exception raised along the way is logged at
    debug level and also results in ``None``.
    """
    import hashlib

    try:
        remote_url = _git_remote_url(scm)
        if remote_url:
            remote_path = _parse_git_remote_path(remote_url)
            digest = hashlib.md5(
                remote_path.encode("utf-8"),
                usedforsecurity=False,  # for FIPS
            )
            return digest.hexdigest()
    except Exception:  # noqa: BLE001
        logger.debug("Failed to get git remote path", exc_info=True)
    return None
106152

107153

108154
def _runtime_info():
@@ -112,6 +158,8 @@ def _runtime_info():
112158
from iterative_telemetry import _generate_ci_id, find_or_create_user_id
113159

114160
from dvc import __version__
161+
from dvc.info import _get_remotes
162+
from dvc.repo import Repo
115163
from dvc.utils import is_binary
116164

117165
ci_id = _generate_ci_id()
@@ -120,13 +168,24 @@ def _runtime_info():
120168
else:
121169
group_id, user_id = None, find_or_create_user_id()
122170

171+
scm = None
172+
remotes = None
173+
try:
174+
repo = Repo()
175+
scm = repo.scm
176+
remotes = _get_remotes(repo.config)
177+
except Exception as exc: # noqa: BLE001
178+
logger.debug("failed to open repo: %s", exc)
179+
123180
return {
124181
"dvc_version": __version__,
125182
"is_binary": is_binary(),
126-
"scm_class": _scm_in_use(),
183+
"scm_class": _scm_in_use(scm),
127184
"system_info": _system_info(),
128185
"user_id": user_id,
129186
"group_id": group_id,
187+
"remotes": remotes,
188+
"git_remote_hash": _git_remote_path_hash(scm),
130189
}
131190

132191

dvc/info.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,10 @@ def get_dvc_info():
4848
info.append("Cache types: " + error_link("no-dvc-cache"))
4949

5050
info.append(f"Caches: {_get_caches(repo.cache)}")
51-
info.append(f"Remotes: {_get_remotes(repo.config)}")
51+
52+
configured_remotes = _get_remotes(repo.config)
53+
remotes = ", ".join(configured_remotes) if configured_remotes else None
54+
info.append(f"Remotes: {remotes}")
5255

5356
root_directory = repo.root_dir
5457
fs_root = _get_fs_type(os.path.abspath(root_directory))
@@ -75,12 +78,10 @@ def _get_caches(cache):
7578

7679

7780
def _get_remotes(config):
78-
schemes = (
81+
return [
7982
get_fs_cls(get_fs_config(config, name=remote)).protocol
8083
for remote in config["remote"]
81-
)
82-
83-
return ", ".join(schemes) or "None"
84+
]
8485

8586

8687
def _get_linktype_support_info(repo):

tests/func/test_analytics.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import pytest
44

5+
from dvc import __version__, env
56
from dvc.analytics import _scm_in_use, collect_and_send_report
67
from dvc.cli import main
78
from dvc.repo import Repo
@@ -33,27 +34,37 @@ def func(argv):
3334
return mocker.patch("dvc.daemon.daemon", mocker.MagicMock(side_effect=func))
3435

3536

36-
def test_collect_and_send_report(mocker, dvc, mock_daemon):
37+
def test_collect_and_send_report(monkeypatch, mocker, dvc, mock_daemon):
38+
monkeypatch.delenv(env.DVC_ANALYTICS_ENDPOINT, raising=False)
3739
mock_post = mocker.patch("requests.post")
3840
collect_and_send_report()
3941

4042
assert mock_daemon.call_count == 1
4143
assert mock_post.call_count == 1
4244
assert mock_post.call_args == mocker.call(
4345
"https://analytics.dvc.org",
44-
json=ANY(dict),
46+
json={
47+
"dvc_version": __version__,
48+
"scm_class": type(dvc.scm).__name__,
49+
"is_binary": False,
50+
"system_info": ANY(dict),
51+
"user_id": ANY(str),
52+
"group_id": mocker.ANY,
53+
"remotes": ANY(list),
54+
"git_remote_hash": None,
55+
},
4556
headers={"content-type": "application/json"},
4657
timeout=5,
4758
)
4859

4960

5061
def test_scm_dvc_only(tmp_dir, dvc):
51-
scm = _scm_in_use()
62+
scm = _scm_in_use(dvc.scm)
5263
assert scm == "NoSCM"
5364

5465

5566
def test_scm_git(tmp_dir, scm, dvc):
56-
scm = _scm_in_use()
67+
scm = _scm_in_use(scm)
5768
assert scm == "Git"
5869

5970

@@ -62,7 +73,7 @@ def test_scm_subrepo(tmp_dir, scm):
6273
subdir.mkdir()
6374

6475
with subdir.chdir():
65-
Repo.init(subdir=True)
66-
scm = _scm_in_use()
76+
repo = Repo.init(subdir=True)
77+
scm = _scm_in_use(repo.scm)
6778

6879
assert scm == "Git"

tests/unit/test_analytics.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1+
import hashlib
12
import json
23
import platform
34

45
import pytest
56
from voluptuous import Any, Schema
67

7-
from dvc import analytics
8+
from dvc import analytics, env
89
from dvc.cli import parse_args
910

1011

@@ -50,14 +51,17 @@ def test_runtime_info(tmp_global_dir):
5051
"user_id": str,
5152
"system_info": dict,
5253
"group_id": Any(str, None),
54+
"remotes": Any(list, None),
55+
"git_remote_hash": Any(str, None),
5356
},
5457
required=True,
5558
)
5659

5760
assert schema(analytics._runtime_info())
5861

5962

60-
def test_send(mocker, tmp_path):
63+
def test_send(monkeypatch, mocker, tmp_path):
64+
monkeypatch.delenv(env.DVC_ANALYTICS_ENDPOINT, raising=False)
6165
mock_post = mocker.patch("requests.post")
6266

6367
import requests
@@ -158,3 +162,38 @@ def test_system_info():
158162
)
159163

160164
assert schema(analytics._system_info())
165+
166+
167+
@pytest.mark.parametrize(
    "git_remote",
    [
        "git://github.com/iterative/dvc.git",
        "git@github.com:iterative/dvc.git",
        "http://github.com/iterative/dvc.git",
        "https://github.com/iterative/dvc.git",
        "ssh://git@github.com/iterative/dvc.git",
    ],
)
def test_git_remote_hash(mocker, git_remote):
    """Every network remote form of the same repo hashes to the same value."""
    m = mocker.patch("dvc.analytics._git_remote_url", return_value=git_remote)
    # Same md5 call as the code under test, so the expected value can also be
    # computed on FIPS-enabled hosts.
    expected = hashlib.md5(b"iterative/dvc.git", usedforsecurity=False).hexdigest()

    assert analytics._git_remote_path_hash(None) == expected
    m.assert_called_once_with(None)
183+
184+
185+
@pytest.mark.parametrize(
    "git_remote",
    [
        "C:\\Users\\user\\dvc.git",
        "/home/user/dvc.git",
        "file:///home/user/dvc.git",
        "./dvc.git",
    ],
)
def test_git_remote_hash_local(mocker, git_remote):
    """Local remotes are hashed as-is, without any path extraction."""
    m = mocker.patch("dvc.analytics._git_remote_url", return_value=git_remote)

    # Same md5 call as the code under test, so the expected value can also be
    # computed on FIPS-enabled hosts.
    expected = hashlib.md5(
        git_remote.encode("utf-8"), usedforsecurity=False
    ).hexdigest()
    assert analytics._git_remote_path_hash(None) == expected
    m.assert_called_once_with(None)

0 commit comments

Comments
 (0)