2 changes: 2 additions & 0 deletions Include/cpython/pystate.h
@@ -135,6 +135,8 @@ struct _ts {
/* Pointer to currently executing frame. */
struct _PyInterpreterFrame *current_frame;

struct _PyInterpreterFrame *last_profiled_frame;

Py_tracefunc c_profilefunc;
Py_tracefunc c_tracefunc;
PyObject *c_profileobj;
2 changes: 2 additions & 0 deletions Include/internal/pycore_debug_offsets.h
@@ -102,6 +102,7 @@ typedef struct _Py_DebugOffsets {
uint64_t next;
uint64_t interp;
uint64_t current_frame;
uint64_t last_profiled_frame;
uint64_t thread_id;
uint64_t native_thread_id;
uint64_t datastack_chunk;
@@ -272,6 +273,7 @@ typedef struct _Py_DebugOffsets {
.next = offsetof(PyThreadState, next), \
.interp = offsetof(PyThreadState, interp), \
.current_frame = offsetof(PyThreadState, current_frame), \
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
.thread_id = offsetof(PyThreadState, thread_id), \
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \
2 changes: 2 additions & 0 deletions Include/internal/pycore_global_objects_fini_generated.h

Generated file; diff not rendered by default.

2 changes: 2 additions & 0 deletions Include/internal/pycore_global_strings.h
@@ -332,6 +332,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(c_parameter_type)
STRUCT_FOR_ID(c_return)
STRUCT_FOR_ID(c_stack)
STRUCT_FOR_ID(cache_frames)
STRUCT_FOR_ID(cached_datetime_module)
STRUCT_FOR_ID(cached_statements)
STRUCT_FOR_ID(cadata)
@@ -776,6 +777,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(stacklevel)
STRUCT_FOR_ID(start)
STRUCT_FOR_ID(statement)
STRUCT_FOR_ID(stats)
STRUCT_FOR_ID(status)
STRUCT_FOR_ID(stderr)
STRUCT_FOR_ID(stdin)
2 changes: 2 additions & 0 deletions Include/internal/pycore_runtime_init_generated.h

Generated file; diff not rendered by default.

8 changes: 8 additions & 0 deletions Include/internal/pycore_unicodeobject_generated.h

Generated file; diff not rendered by default.

20 changes: 20 additions & 0 deletions InternalDocs/frames.md
@@ -111,6 +111,26 @@ The shim frame points to a special code object containing the `INTERPRETER_EXIT`
instruction which cleans up the shim frame and returns.


### Remote Profiling Frame Cache

The `last_profiled_frame` field in `PyThreadState` supports an optimization for
remote profilers that sample call stacks from external processes. When a remote
profiler reads the call stack, it writes the current frame address to this field.
The eval loop then keeps this pointer valid by updating it to the parent frame
whenever a frame returns (in `_PyEval_FrameClearAndPop`).

This creates a "high-water mark" that always points to a frame still on the stack.
On subsequent samples, the profiler can walk from `current_frame` until it reaches
`last_profiled_frame`, knowing that frames from that point downward are unchanged
and can be retrieved from a cache. This significantly reduces the number of remote
memory reads needed when call stacks are deep and stable at their base.
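
A minimal sketch of the profiler-side walk, assuming a hypothetical
`read_frame(addr)` helper that reads one remote `_PyInterpreterFrame` and
returns its decoded info plus the parent address (the real logic lives in the
`_remote_debugging` unwinder):

```python
def sample_stack(read_frame, current_frame_addr, last_profiled_addr, prev_stack):
    """Walk the remote stack, splicing in the tail captured on the last sample.

    prev_stack is the previous sample's list of (address, frame_info) pairs,
    ordered from the top of the stack to the bottom.
    """
    prev_index = {addr: i for i, (addr, _) in enumerate(prev_stack)}
    stack = []
    addr = current_frame_addr
    while addr:
        if addr == last_profiled_addr and addr in prev_index:
            # Frames from the high-water mark downward are unchanged, so the
            # cached tail can be reused without touching remote memory.
            stack.extend(prev_stack[prev_index[addr]:])
            break
        frame_info, parent_addr = read_frame(addr)  # one remote read
        stack.append((addr, frame_info))
        addr = parent_addr
    return stack
```

After each sample the profiler writes the sampled `current_frame` address back
into the remote `last_profiled_frame` field so that the next walk can stop early.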

The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
`last_profiled_frame` is non-NULL AND matches the frame being popped. This
prevents transient frames (called and returned between profiler samples) from
corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
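
Illustrative Python-style pseudocode for this guard (the actual implementation
is C code in `_PyEval_FrameClearAndPop`; `previous` stands for the popped
frame's parent):

```python
def frame_clear_and_pop(tstate, frame):
    # Only touch the field when the frame being popped *is* the profiler's
    # high-water mark; transient frames that were never sampled fail this
    # check, and a NULL field means profiling is inactive.
    if tstate.last_profiled_frame is not None and tstate.last_profiled_frame is frame:
        # Move the mark to the parent so it keeps pointing at a live frame.
        tstate.last_profiled_frame = frame.previous
    ...  # normal frame clearing and popping continues
```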


### The Instruction Pointer

`_PyInterpreterFrame` has two fields which are used to maintain the instruction
112 changes: 102 additions & 10 deletions Lib/profiling/sampling/sample.py
@@ -27,21 +27,24 @@


class SampleProfiler:
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True):
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True, collect_stats=False):
self.pid = pid
self.sample_interval_usec = sample_interval_usec
self.all_threads = all_threads
self.mode = mode # Store mode for later use
self.collect_stats = collect_stats
if _FREE_THREADED_BUILD:
self.unwinder = _remote_debugging.RemoteUnwinder(
self.pid, all_threads=self.all_threads, mode=mode, native=native, gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
stats=collect_stats
)
else:
only_active_threads = bool(self.all_threads)
self.unwinder = _remote_debugging.RemoteUnwinder(
self.pid, only_active_thread=only_active_threads, mode=mode, native=native, gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
stats=collect_stats
)
# Track sample intervals and total sample count
self.sample_intervals = deque(maxlen=100)
@@ -124,6 +127,10 @@ def sample(self, collector, duration_sec=10):
print(f"Sample rate: {sample_rate:.2f} samples/sec")
print(f"Error rate: {error_rate:.2f}%")

# Print unwinder stats if stats collection is enabled
if self.collect_stats:
self._print_unwinder_stats()

# Pass stats to flamegraph collector if it's the right type
if hasattr(collector, 'set_stats'):
collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode)
@@ -171,17 +178,100 @@ def _print_realtime_stats(self):
(1.0 / min_hz) * 1_000_000 if min_hz > 0 else 0
) # Max time = Min Hz

# Build cache stats string if stats collection is enabled
cache_stats_str = ""
if self.collect_stats:
try:
stats = self.unwinder.get_stats()
hits = stats.get('frame_cache_hits', 0)
partial = stats.get('frame_cache_partial_hits', 0)
misses = stats.get('frame_cache_misses', 0)
total = hits + partial + misses
if total > 0:
hit_pct = (hits + partial) / total * 100
cache_stats_str = f" {ANSIColors.MAGENTA}Cache: {hit_pct:.1f}% ({hits}+{partial}/{misses}){ANSIColors.RESET}"
except RuntimeError:
pass

# Clear line and print stats
print(
f"\r\033[K{ANSIColors.BOLD_BLUE}Real-time sampling stats:{ANSIColors.RESET} "
f"{ANSIColors.YELLOW}Mean: {mean_hz:.1f}Hz ({mean_us_per_sample:.2f}µs){ANSIColors.RESET} "
f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz ({max_us_per_sample:.2f}µs){ANSIColors.RESET} "
f"{ANSIColors.RED}Max: {max_hz:.1f}Hz ({min_us_per_sample:.2f}µs){ANSIColors.RESET} "
f"{ANSIColors.CYAN}Samples: {self.total_samples}{ANSIColors.RESET}",
f"\r\033[K{ANSIColors.BOLD_BLUE}Stats:{ANSIColors.RESET} "
f"{ANSIColors.YELLOW}{mean_hz:.1f}Hz ({mean_us_per_sample:.1f}µs){ANSIColors.RESET} "
f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz{ANSIColors.RESET} "
f"{ANSIColors.RED}Max: {max_hz:.1f}Hz{ANSIColors.RESET} "
f"{ANSIColors.CYAN}N={self.total_samples}{ANSIColors.RESET}"
f"{cache_stats_str}",
end="",
flush=True,
)

def _print_unwinder_stats(self):
"""Print unwinder statistics including cache performance."""
try:
stats = self.unwinder.get_stats()
except RuntimeError:
return # Stats not enabled

print(f"\n{ANSIColors.BOLD_BLUE}{'='*50}{ANSIColors.RESET}")
print(f"{ANSIColors.BOLD_BLUE}Unwinder Statistics:{ANSIColors.RESET}")

# Frame cache stats
total_samples = stats.get('total_samples', 0)
frame_cache_hits = stats.get('frame_cache_hits', 0)
frame_cache_partial_hits = stats.get('frame_cache_partial_hits', 0)
frame_cache_misses = stats.get('frame_cache_misses', 0)
total_lookups = frame_cache_hits + frame_cache_partial_hits + frame_cache_misses

# Calculate percentages
hits_pct = (frame_cache_hits / total_lookups * 100) if total_lookups > 0 else 0
partial_pct = (frame_cache_partial_hits / total_lookups * 100) if total_lookups > 0 else 0
misses_pct = (frame_cache_misses / total_lookups * 100) if total_lookups > 0 else 0

print(f" {ANSIColors.CYAN}Frame Cache:{ANSIColors.RESET}")
print(f" Total samples: {total_samples:,}")
print(f" Full hits: {frame_cache_hits:,} ({ANSIColors.GREEN}{hits_pct:.1f}%{ANSIColors.RESET})")
print(f" Partial hits: {frame_cache_partial_hits:,} ({ANSIColors.YELLOW}{partial_pct:.1f}%{ANSIColors.RESET})")
print(f" Misses: {frame_cache_misses:,} ({ANSIColors.RED}{misses_pct:.1f}%{ANSIColors.RESET})")

# Frame read stats
frames_from_cache = stats.get('frames_read_from_cache', 0)
frames_from_memory = stats.get('frames_read_from_memory', 0)
total_frames = frames_from_cache + frames_from_memory
cache_frame_pct = (frames_from_cache / total_frames * 100) if total_frames > 0 else 0
memory_frame_pct = (frames_from_memory / total_frames * 100) if total_frames > 0 else 0

print(f" {ANSIColors.CYAN}Frame Reads:{ANSIColors.RESET}")
print(f" From cache: {frames_from_cache:,} ({ANSIColors.GREEN}{cache_frame_pct:.1f}%{ANSIColors.RESET})")
print(f" From memory: {frames_from_memory:,} ({ANSIColors.RED}{memory_frame_pct:.1f}%{ANSIColors.RESET})")

# Code object cache stats
code_hits = stats.get('code_object_cache_hits', 0)
code_misses = stats.get('code_object_cache_misses', 0)
total_code = code_hits + code_misses
code_hits_pct = (code_hits / total_code * 100) if total_code > 0 else 0
code_misses_pct = (code_misses / total_code * 100) if total_code > 0 else 0

print(f" {ANSIColors.CYAN}Code Object Cache:{ANSIColors.RESET}")
print(f" Hits: {code_hits:,} ({ANSIColors.GREEN}{code_hits_pct:.1f}%{ANSIColors.RESET})")
print(f" Misses: {code_misses:,} ({ANSIColors.RED}{code_misses_pct:.1f}%{ANSIColors.RESET})")

# Memory operations
memory_reads = stats.get('memory_reads', 0)
memory_bytes = stats.get('memory_bytes_read', 0)
if memory_bytes >= 1024 * 1024:
memory_str = f"{memory_bytes / (1024 * 1024):.1f} MB"
elif memory_bytes >= 1024:
memory_str = f"{memory_bytes / 1024:.1f} KB"
else:
memory_str = f"{memory_bytes} B"
print(f" {ANSIColors.CYAN}Memory:{ANSIColors.RESET}")
print(f" Read operations: {memory_reads:,} ({memory_str})")

# Stale invalidations
stale_invalidations = stats.get('stale_cache_invalidations', 0)
if stale_invalidations > 0:
print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}")


def sample(
pid,
@@ -228,7 +318,8 @@ def sample(
mode=mode,
native=native,
gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads,
collect_stats=realtime_stats,
)
profiler.realtime_stats = realtime_stats

@@ -283,7 +374,8 @@ def sample_live(
mode=mode,
native=native,
gc=gc,
skip_non_matching_threads=skip_non_matching_threads
skip_non_matching_threads=skip_non_matching_threads,
collect_stats=realtime_stats,
)
profiler.realtime_stats = realtime_stats
