@@ -130,18 +130,20 @@ mark_executable(unsigned char *memory, size_t size)
130130
131131// JIT compiler stuff: /////////////////////////////////////////////////////////
132132
// Size in bytes of a single GOT slot: one pointer-sized integer per symbol.
// The expansion is parenthesized so it stays a single operand wherever the
// macro is used inside a larger expression.
#define GOT_SLOT_SIZE (sizeof(uintptr_t))
// Number of 32-bit words in a symbol_mask. Each symbol ordinal owns one bit
// (word ordinal / 32, bit ordinal % 32), so a mask can describe up to
// SYMBOL_MASK_WORDS * 32 distinct symbols.
#define SYMBOL_MASK_WORDS 8

// Bit set over symbol ordinals.
typedef uint32_t symbol_mask[SYMBOL_MASK_WORDS];
136137
// Bookkeeping for one contiguous region of fixed-size, per-symbol slots.
137138typedef struct {
// Base address of the region; slot number i starts at mem + i * slot size.
138139 unsigned char * mem ;
// One bit per symbol ordinal (word ordinal / 32, bit ordinal % 32); a set bit
// means that symbol has a slot in this region.
139140 symbol_mask mask ;
// Total size of the region in bytes (number of set mask bits * slot size).
140141 size_t size ;
141- } trampoline_state ;
142+ } symbol_state ;
142143
// Per-compilation state shared by the patching helpers.
143144typedef struct {
144- trampoline_state trampolines ;
// Slots of TRAMPOLINE_SIZE bytes each, one per symbol that needs a trampoline.
145+ symbol_state trampolines ;
// Slots of GOT_SLOT_SIZE bytes each, one per symbol that needs a GOT entry.
146+ symbol_state got_symbols ;
// One entry per position in the trace.
// NOTE(review): presumably the start address of each instruction's emitted
// code; it is filled in outside this view - confirm.
145147 uintptr_t instruction_starts [UOP_MAX_TRACE_LENGTH ];
146148} jit_state ;
147149
@@ -205,6 +207,33 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
205207// - x86_64-unknown-linux-gnu:
206208// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp
207209
210+
211+ // Get the symbol slot memory location for a given symbol ordinal.
212+ static unsigned char *
213+ get_symbol_slot (int ordinal , symbol_state * state , int size )
214+ {
215+ const uint32_t symbol_mask = 1U << (ordinal % 32 );
216+ const uint32_t state_mask = state -> mask [ordinal / 32 ];
217+ assert (symbol_mask & state_mask );
218+
219+ // Count the number of set bits in the symbol mask lower than ordinal
220+ size_t index = _Py_popcount32 (state_mask & (symbol_mask - 1 ));
221+ for (int i = 0 ; i < ordinal / 32 ; i ++ ) {
222+ index += _Py_popcount32 (state -> mask [i ]);
223+ }
224+
225+ unsigned char * slot = state -> mem + index * size ;
226+ assert ((size_t )(index + 1 ) * size <= state -> size );
227+ return slot ;
228+ }
229+
230+ // Return the address of the GOT slot for the requested symbol ordinal.
231+ static uintptr_t
232+ got_symbol_address (int ordinal , jit_state * state )
233+ {
234+ return (uintptr_t )get_symbol_slot (ordinal , & state -> got_symbols , GOT_SLOT_SIZE );
235+ }
236+
208237// Many of these patches are "relaxing", meaning that they can rewrite the
209238// code they're patching to be more efficient (like turning a 64-bit memory
210239// load into a 32-bit immediate load). These patches have an "x" in their name.
@@ -447,6 +476,7 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value)
447476 patch_32r (location , value );
448477}
449478
479+ void patch_got_symbol (jit_state * state , int ordinal );
450480void patch_aarch64_trampoline (unsigned char * location , int ordinal , jit_state * state );
451481void patch_x86_64_trampoline (unsigned char * location , int ordinal , jit_state * state );
452482
@@ -465,23 +495,13 @@ void patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *st
465495 #define DATA_ALIGN 1
466496#endif
467497
468- // Get the trampoline memory location for a given symbol ordinal.
469- static unsigned char *
470- get_trampoline_slot ( int ordinal , jit_state * state )
498+ // Populate the GOT entry for the given symbol ordinal with its resolved address .
499+ void
500+ patch_got_symbol ( jit_state * state , int ordinal )
471501{
472- const uint32_t symbol_mask = 1 << (ordinal % 32 );
473- const uint32_t trampoline_mask = state -> trampolines .mask [ordinal / 32 ];
474- assert (symbol_mask & trampoline_mask );
475-
476- // Count the number of set bits in the trampoline mask lower than ordinal
477- int index = _Py_popcount32 (trampoline_mask & (symbol_mask - 1 ));
478- for (int i = 0 ; i < ordinal / 32 ; i ++ ) {
479- index += _Py_popcount32 (state -> trampolines .mask [i ]);
480- }
481-
482- unsigned char * trampoline = state -> trampolines .mem + index * TRAMPOLINE_SIZE ;
483- assert ((size_t )(index + 1 ) * TRAMPOLINE_SIZE <= state -> trampolines .size );
484- return trampoline ;
502+ uint64_t value = (uintptr_t )symbols_map [ordinal ];
503+ unsigned char * location = (unsigned char * )get_symbol_slot (ordinal , & state -> got_symbols , GOT_SLOT_SIZE );
504+ patch_64 (location , value );
485505}
486506
487507// Generate and patch AArch64 trampolines. The symbols to jump to are stored
@@ -501,8 +521,7 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state)
501521 }
502522
503523 // Out of range - need a trampoline
504- uint32_t * p = (uint32_t * )get_trampoline_slot (ordinal , state );
505-
524+ uint32_t * p = (uint32_t * )get_symbol_slot (ordinal , & state -> trampolines , TRAMPOLINE_SIZE );
506525
507526 /* Generate the trampoline
508527 0: 58000048 ldr x8, 8
@@ -532,7 +551,7 @@ patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *state)
532551 }
533552
534553 // Out of range - need a trampoline
535- unsigned char * trampoline = get_trampoline_slot (ordinal , state );
554+ unsigned char * trampoline = get_symbol_slot (ordinal , & state -> trampolines , TRAMPOLINE_SIZE );
536555
537556 /* Generate the trampoline (14 bytes, padded to 16):
538557 0: ff 25 00 00 00 00 jmp *(%rip)
@@ -574,21 +593,26 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
574593 code_size += group -> code_size ;
575594 data_size += group -> data_size ;
576595 combine_symbol_mask (group -> trampoline_mask , state .trampolines .mask );
596+ combine_symbol_mask (group -> got_mask , state .got_symbols .mask );
577597 }
578598 group = & stencil_groups [_FATAL_ERROR ];
579599 code_size += group -> code_size ;
580600 data_size += group -> data_size ;
581601 combine_symbol_mask (group -> trampoline_mask , state .trampolines .mask );
602+ combine_symbol_mask (group -> got_mask , state .got_symbols .mask );
582603 // Calculate the size of the trampolines required by the whole trace
583604 for (size_t i = 0 ; i < Py_ARRAY_LENGTH (state .trampolines .mask ); i ++ ) {
584605 state .trampolines .size += _Py_popcount32 (state .trampolines .mask [i ]) * TRAMPOLINE_SIZE ;
585606 }
607+ for (size_t i = 0 ; i < Py_ARRAY_LENGTH (state .got_symbols .mask ); i ++ ) {
608+ state .got_symbols .size += _Py_popcount32 (state .got_symbols .mask [i ]) * GOT_SLOT_SIZE ;
609+ }
586610 // Round up to the nearest page:
587611 size_t page_size = get_page_size ();
588612 assert ((page_size & (page_size - 1 )) == 0 );
589613 size_t code_padding = DATA_ALIGN - ((code_size + state .trampolines .size ) & (DATA_ALIGN - 1 ));
590- size_t padding = page_size - ((code_size + state .trampolines .size + code_padding + data_size ) & (page_size - 1 ));
591- size_t total_size = code_size + state .trampolines .size + code_padding + data_size + padding ;
614+ size_t padding = page_size - ((code_size + state .trampolines .size + code_padding + data_size + state . got_symbols . size ) & (page_size - 1 ));
615+ size_t total_size = code_size + state .trampolines .size + code_padding + data_size + state . got_symbols . size + padding ;
592616 unsigned char * memory = jit_alloc (total_size );
593617 if (memory == NULL ) {
594618 return -1 ;
@@ -598,6 +622,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
598622 OPT_STAT_ADD (jit_code_size , code_size );
599623 OPT_STAT_ADD (jit_trampoline_size , state .trampolines .size );
600624 OPT_STAT_ADD (jit_data_size , data_size );
625+ OPT_STAT_ADD (jit_got_size , state .got_symbols .size );
601626 OPT_STAT_ADD (jit_padding_size , padding );
602627 OPT_HIST (total_size , trace_total_memory_hist );
603628 // Update the offsets of each instruction:
@@ -608,6 +633,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
608633 unsigned char * code = memory ;
609634 state .trampolines .mem = memory + code_size ;
610635 unsigned char * data = memory + code_size + state .trampolines .size + code_padding ;
636+ state .got_symbols .mem = data + data_size ;
611637 assert (trace [0 ].opcode == _START_EXECUTOR || trace [0 ].opcode == _COLD_EXIT || trace [0 ].opcode == _COLD_DYNAMIC_EXIT );
612638 for (size_t i = 0 ; i < length ; i ++ ) {
613639 const _PyUOpInstruction * instruction = & trace [i ];
@@ -649,19 +675,21 @@ compile_trampoline(void)
649675 code_size += group -> code_size ;
650676 data_size += group -> data_size ;
651677 combine_symbol_mask (group -> trampoline_mask , state .trampolines .mask );
678+ combine_symbol_mask (group -> got_mask , state .got_symbols .mask );
652679 // Round up to the nearest page:
653680 size_t page_size = get_page_size ();
654681 assert ((page_size & (page_size - 1 )) == 0 );
655682 size_t code_padding = DATA_ALIGN - ((code_size + state .trampolines .size ) & (DATA_ALIGN - 1 ));
656- size_t padding = page_size - ((code_size + state .trampolines .size + code_padding + data_size ) & (page_size - 1 ));
657- size_t total_size = code_size + state .trampolines .size + code_padding + data_size + padding ;
683+ size_t padding = page_size - ((code_size + state .trampolines .size + code_padding + data_size + state . got_symbols . size ) & (page_size - 1 ));
684+ size_t total_size = code_size + state .trampolines .size + code_padding + data_size + state . got_symbols . size + padding ;
658685 unsigned char * memory = jit_alloc (total_size );
659686 if (memory == NULL ) {
660687 return NULL ;
661688 }
662689 unsigned char * code = memory ;
663690 state .trampolines .mem = memory + code_size ;
664691 unsigned char * data = memory + code_size + state .trampolines .size + code_padding ;
692+ state .got_symbols .mem = data + data_size ;
665693 // Compile the shim, which handles converting between the native
666694 // calling convention and the calling convention used by jitted code
667695 // (which may be different for efficiency reasons).
0 commit comments