Commit c1f2db3
Use orjson instead of json, when available (#17955)
For `mypy -c 'import torch'`, the cache load time goes from 0.44s to 0.25s as measured by the manager's data_json_load_time stat. If I time the dumps specifically, they go from 0.65s to 0.07s. Overall, a pretty reasonable perf win -- should we make it a required dependency? See also #3456
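The new helpers come from mypy/util.py, whose diff is among the six changed files but is not reproduced in this excerpt. Below is a rough sketch of the fallback pattern, relying only on orjson's documented `dumps`/`loads` API; the exact option handling and the `debug` parameter name are assumptions, not the literal mypy.util code (build.py's `json_dumps(meta_dict, manager.options.debug_cache)` call implies some such second parameter):

from typing import Any

import json

try:
    import orjson  # optional fast path; stdlib json is the fallback
except ImportError:
    orjson = None  # type: ignore[assignment]


def json_dumps(obj: Any, debug: bool = False) -> bytes:
    # Both serializers must produce deterministic output (sorted keys),
    # since callers hash the result for cache invalidation.
    if orjson is not None:
        option = orjson.OPT_SORT_KEYS | (orjson.OPT_INDENT_2 if debug else 0)
        return orjson.dumps(obj, option=option)
    if debug:
        return json.dumps(obj, indent=2, sort_keys=True).encode("utf-8")
    return json.dumps(obj, sort_keys=True, separators=(",", ":")).encode("utf-8")


def json_loads(data: bytes) -> Any:
    # orjson.loads accepts bytes directly; so does stdlib json.loads.
    return orjson.loads(data) if orjson is not None else json.loads(data)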
1 parent f63fdb3 commit c1f2db3

6 files changed: +81 −68 lines

misc/apply-cache-diff.py (+6 −6)
@@ -8,13 +8,13 @@
 from __future__ import annotations
 
 import argparse
-import json
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
+from mypy.util import json_dumps, json_loads
 
 
 def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
@@ -26,21 +26,21 @@ def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
 
 def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
     cache = make_cache(cache_dir, sqlite)
-    with open(diff_file) as f:
-        diff = json.load(f)
+    with open(diff_file, "rb") as f:
+        diff = json_loads(f.read())
 
-    old_deps = json.loads(cache.read("@deps.meta.json"))
+    old_deps = json_loads(cache.read("@deps.meta.json"))
 
     for file, data in diff.items():
         if data is None:
             cache.remove(file)
         else:
             cache.write(file, data)
             if file.endswith(".meta.json") and "@deps" not in file:
-                meta = json.loads(data)
+                meta = json_loads(data)
                 old_deps["snapshot"][meta["id"]] = meta["hash"]
 
-    cache.write("@deps.meta.json", json.dumps(old_deps))
+    cache.write("@deps.meta.json", json_dumps(old_deps))
 
     cache.commit()

misc/diff-cache.py (+7 −7)
@@ -8,7 +8,6 @@
 from __future__ import annotations
 
 import argparse
-import json
 import os
 import sys
 from collections import defaultdict
@@ -17,6 +16,7 @@
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
+from mypy.util import json_dumps, json_loads
 
 
 def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
@@ -33,7 +33,7 @@ def merge_deps(all: dict[str, set[str]], new: dict[str, set[str]]) -> None:
 
 def load(cache: MetadataStore, s: str) -> Any:
     data = cache.read(s)
-    obj = json.loads(data)
+    obj = json_loads(data)
     if s.endswith(".meta.json"):
         # For meta files, zero out the mtimes and sort the
         # dependencies to avoid spurious conflicts
@@ -73,7 +73,7 @@ def main() -> None:
     type_misses: dict[str, int] = defaultdict(int)
     type_hits: dict[str, int] = defaultdict(int)
 
-    updates: dict[str, str | None] = {}
+    updates: dict[str, bytes | None] = {}
 
     deps1: dict[str, set[str]] = {}
     deps2: dict[str, set[str]] = {}
@@ -96,7 +96,7 @@ def main() -> None:
             # so we can produce a much smaller direct diff of them.
             if ".deps." not in s:
                 if obj2 is not None:
-                    updates[s] = json.dumps(obj2)
+                    updates[s] = json_dumps(obj2)
                 else:
                     updates[s] = None
             elif obj2:
@@ -122,7 +122,7 @@ def main() -> None:
         merge_deps(new_deps, root_deps)
 
     new_deps_json = {k: list(v) for k, v in new_deps.items() if v}
-    updates["@root.deps.json"] = json.dumps(new_deps_json)
+    updates["@root.deps.json"] = json_dumps(new_deps_json)
 
     # Drop updates to deps.meta.json for size reasons. The diff
     # applier will manually fix it up.
@@ -136,8 +136,8 @@ def main() -> None:
         print("hits", type_hits)
         print("misses", type_misses)
 
-    with open(args.output, "w") as f:
-        json.dump(updates, f)
+    with open(args.output, "wb") as f:
+        f.write(json_dumps(updates))
 
 
 if __name__ == "__main__":

mypy/build.py (+18 −27)
@@ -95,6 +95,7 @@
 from mypy.stubinfo import legacy_bundled_packages, non_bundled_packages, stub_distribution_name
 from mypy.types import Type
 from mypy.typestate import reset_global_state, type_state
+from mypy.util import json_dumps, json_loads
 from mypy.version import __version__
 
 # Switch to True to produce debug output related to fine-grained incremental
@@ -858,7 +859,7 @@ def load_fine_grained_deps(self, id: str) -> dict[str, set[str]]:
         t0 = time.time()
         if id in self.fg_deps_meta:
             # TODO: Assert deps file wasn't changed.
-            deps = json.loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
+            deps = json_loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
         else:
             deps = {}
         val = {k: set(v) for k, v in deps.items()}
@@ -911,8 +912,8 @@ def stats_summary(self) -> Mapping[str, object]:
         return self.stats
 
 
-def deps_to_json(x: dict[str, set[str]]) -> str:
-    return json.dumps({k: list(v) for k, v in x.items()}, separators=(",", ":"))
+def deps_to_json(x: dict[str, set[str]]) -> bytes:
+    return json_dumps({k: list(v) for k, v in x.items()})
 
 
 # File for storing metadata about all the fine-grained dependency caches
@@ -980,7 +981,7 @@ def write_deps_cache(
 
     meta = {"snapshot": meta_snapshot, "deps_meta": fg_deps_meta}
 
-    if not metastore.write(DEPS_META_FILE, json.dumps(meta, separators=(",", ":"))):
+    if not metastore.write(DEPS_META_FILE, json_dumps(meta)):
         manager.log(f"Error writing fine-grained deps meta JSON file {DEPS_META_FILE}")
         error = True
 
@@ -1048,7 +1049,7 @@ def generate_deps_for_cache(manager: BuildManager, graph: Graph) -> dict[str, di
 
 def write_plugins_snapshot(manager: BuildManager) -> None:
     """Write snapshot of versions and hashes of currently active plugins."""
-    snapshot = json.dumps(manager.plugins_snapshot, separators=(",", ":"))
+    snapshot = json_dumps(manager.plugins_snapshot)
     if not manager.metastore.write(PLUGIN_SNAPSHOT_FILE, snapshot):
         manager.errors.set_file(_cache_dir_prefix(manager.options), None, manager.options)
         manager.errors.report(0, 0, "Error writing plugins snapshot", blocker=True)
@@ -1079,8 +1080,8 @@ def read_quickstart_file(
     # just ignore it.
     raw_quickstart: dict[str, Any] = {}
    try:
-        with open(options.quickstart_file) as f:
-            raw_quickstart = json.load(f)
+        with open(options.quickstart_file, "rb") as f:
+            raw_quickstart = json_loads(f.read())
 
         quickstart = {}
         for file, (x, y, z) in raw_quickstart.items():
@@ -1148,10 +1149,10 @@ def _load_json_file(
     manager.add_stats(metastore_read_time=time.time() - t0)
     # Only bother to compute the log message if we are logging it, since it could be big
     if manager.verbosity() >= 2:
-        manager.trace(log_success + data.rstrip())
+        manager.trace(log_success + data.rstrip().decode())
     try:
         t1 = time.time()
-        result = json.loads(data)
+        result = json_loads(data)
         manager.add_stats(data_json_load_time=time.time() - t1)
     except json.JSONDecodeError:
         manager.errors.set_file(file, None, manager.options)
@@ -1343,8 +1344,8 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> CacheMeta | No
     # So that plugins can return data with tuples in it without
     # things silently always invalidating modules, we round-trip
     # the config data. This isn't beautiful.
-    plugin_data = json.loads(
-        json.dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
+    plugin_data = json_loads(
+        json_dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
     )
     if m.plugin_data != plugin_data:
         manager.log(f"Metadata abandoned for {id}: plugin configuration differs")
@@ -1478,18 +1479,15 @@ def validate_meta(
         "ignore_all": meta.ignore_all,
         "plugin_data": meta.plugin_data,
     }
-    if manager.options.debug_cache:
-        meta_str = json.dumps(meta_dict, indent=2, sort_keys=True)
-    else:
-        meta_str = json.dumps(meta_dict, separators=(",", ":"))
+    meta_bytes = json_dumps(meta_dict, manager.options.debug_cache)
     meta_json, _, _ = get_cache_names(id, path, manager.options)
     manager.log(
         "Updating mtime for {}: file {}, meta {}, mtime {}".format(
             id, path, meta_json, meta.mtime
         )
     )
     t1 = time.time()
-    manager.metastore.write(meta_json, meta_str)  # Ignore errors, just an optimization.
+    manager.metastore.write(meta_json, meta_bytes)  # Ignore errors, just an optimization.
     manager.add_stats(validate_update_time=time.time() - t1, validate_munging_time=t1 - t0)
     return meta
 
@@ -1507,13 +1505,6 @@ def compute_hash(text: str) -> str:
     return hash_digest(text.encode("utf-8"))
 
 
-def json_dumps(obj: Any, debug_cache: bool) -> str:
-    if debug_cache:
-        return json.dumps(obj, indent=2, sort_keys=True)
-    else:
-        return json.dumps(obj, sort_keys=True, separators=(",", ":"))
-
-
 def write_cache(
     id: str,
     path: str,
@@ -1566,8 +1557,8 @@ def write_cache(
 
     # Serialize data and analyze interface
     data = tree.serialize()
-    data_str = json_dumps(data, manager.options.debug_cache)
-    interface_hash = compute_hash(data_str)
+    data_bytes = json_dumps(data, manager.options.debug_cache)
+    interface_hash = hash_digest(data_bytes)
 
     plugin_data = manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=False))
 
@@ -1591,7 +1582,7 @@ def write_cache(
         manager.trace(f"Interface for {id} is unchanged")
     else:
         manager.trace(f"Interface for {id} has changed")
-        if not metastore.write(data_json, data_str):
+        if not metastore.write(data_json, data_bytes):
             # Most likely the error is the replace() call
             # (see https://github.com/python/mypy/issues/3215).
             manager.log(f"Error writing data JSON file {data_json}")
@@ -3568,4 +3559,4 @@ def write_undocumented_ref_info(
     assert not ref_info_file.startswith(".")
 
     deps_json = get_undocumented_ref_info_json(state.tree, type_map)
-    metastore.write(ref_info_file, json.dumps(deps_json, separators=(",", ":")))
+    metastore.write(ref_info_file, json_dumps(deps_json))

mypy/metastore.py (+16 −23)
@@ -33,14 +33,14 @@ def getmtime(self, name: str) -> float:
         """
 
     @abstractmethod
-    def read(self, name: str) -> str:
+    def read(self, name: str) -> bytes:
         """Read the contents of a metadata entry.
 
         Raises FileNotFound if the entry does not exist.
         """
 
     @abstractmethod
-    def write(self, name: str, data: str, mtime: float | None = None) -> bool:
+    def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
         """Write a metadata entry.
 
         If mtime is specified, set it as the mtime of the entry. Otherwise,
@@ -86,16 +86,16 @@ def getmtime(self, name: str) -> float:
 
         return int(os.path.getmtime(os.path.join(self.cache_dir_prefix, name)))
 
-    def read(self, name: str) -> str:
+    def read(self, name: str) -> bytes:
         assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"
 
         if not self.cache_dir_prefix:
             raise FileNotFoundError()
 
-        with open(os.path.join(self.cache_dir_prefix, name)) as f:
+        with open(os.path.join(self.cache_dir_prefix, name), "rb") as f:
             return f.read()
 
-    def write(self, name: str, data: str, mtime: float | None = None) -> bool:
+    def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
         assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"
 
         if not self.cache_dir_prefix:
@@ -105,7 +105,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
         tmp_filename = path + "." + random_string()
         try:
             os.makedirs(os.path.dirname(path), exist_ok=True)
-            with open(tmp_filename, "w") as f:
+            with open(tmp_filename, "wb") as f:
                 f.write(data)
             os.replace(tmp_filename, path)
             if mtime is not None:
@@ -135,27 +135,20 @@ def list_all(self) -> Iterable[str]:
 
 
 SCHEMA = """
-CREATE TABLE IF NOT EXISTS files (
+CREATE TABLE IF NOT EXISTS files2 (
     path TEXT UNIQUE NOT NULL,
     mtime REAL,
-    data TEXT
+    data BLOB
 );
-CREATE INDEX IF NOT EXISTS path_idx on files(path);
+CREATE INDEX IF NOT EXISTS path_idx on files2(path);
 """
-# No migrations yet
-MIGRATIONS: list[str] = []
 
 
 def connect_db(db_file: str) -> sqlite3.Connection:
     import sqlite3.dbapi2
 
     db = sqlite3.dbapi2.connect(db_file)
     db.executescript(SCHEMA)
-    for migr in MIGRATIONS:
-        try:
-            db.executescript(migr)
-        except sqlite3.OperationalError:
-            pass
     return db
 
 
@@ -176,7 +169,7 @@ def _query(self, name: str, field: str) -> Any:
         if not self.db:
             raise FileNotFoundError()
 
-        cur = self.db.execute(f"SELECT {field} FROM files WHERE path = ?", (name,))
+        cur = self.db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,))
         results = cur.fetchall()
         if not results:
             raise FileNotFoundError()
@@ -188,12 +181,12 @@ def getmtime(self, name: str) -> float:
         assert isinstance(mtime, float)
         return mtime
 
-    def read(self, name: str) -> str:
+    def read(self, name: str) -> bytes:
         data = self._query(name, "data")
-        assert isinstance(data, str)
+        assert isinstance(data, bytes)
         return data
 
-    def write(self, name: str, data: str, mtime: float | None = None) -> bool:
+    def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
         import sqlite3
 
         if not self.db:
@@ -202,7 +195,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
             if mtime is None:
                 mtime = time.time()
             self.db.execute(
-                "INSERT OR REPLACE INTO files(path, mtime, data) VALUES(?, ?, ?)",
+                "INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
                 (name, mtime, data),
             )
         except sqlite3.OperationalError:
@@ -213,13 +206,13 @@ def remove(self, name: str) -> None:
         if not self.db:
             raise FileNotFoundError()
 
-        self.db.execute("DELETE FROM files WHERE path = ?", (name,))
+        self.db.execute("DELETE FROM files2 WHERE path = ?", (name,))
 
     def commit(self) -> None:
         if self.db:
             self.db.commit()
 
     def list_all(self) -> Iterable[str]:
         if self.db:
-            for row in self.db.execute("SELECT path FROM files"):
+            for row in self.db.execute("SELECT path FROM files2"):
                 yield row[0]
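Two notes on mypy/metastore.py. The SQLite table is renamed from `files` to `files2` and the `data` column retyped from TEXT to BLOB; presumably the rename makes caches written by older mypy versions invisible to the new code, so they get rebuilt from scratch rather than migrated (the unused MIGRATIONS machinery is dropped at the same time). And since both stores now traffic in bytes end to end, a cache round trip looks roughly like this (a hypothetical usage sketch; the cache directory path is illustrative):

from mypy.metastore import FilesystemMetadataStore
from mypy.util import json_dumps, json_loads

# Path is illustrative; mypy derives the real prefix from its options.
store = FilesystemMetadataStore(".mypy_cache/3.12")

# write() now takes bytes, so serialize first ...
store.write("example.meta.json", json_dumps({"id": "example", "hash": "abc123"}))

# ... and read() returns bytes, which json_loads accepts directly.
meta = json_loads(store.read("example.meta.json"))
assert meta["hash"] == "abc123"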
