jaguar is a project aiming at adding asynchronous support to the Tiger Compiler.

The official repository is located here.



The project has been done for the PRPA class, (parallel programming). The code we want to achieve at the end is something similar to this:

  var buf := async read(10) /* Asynchronous call */
in                          /* This happens in parallel with the read.       */
  for i := 0 to 300000 do   /* ...                                           */
    ();                     /* ...                                           */
                            /* ...                                           */
  print_int(buf)            /* Here, we wait for the asynchronous call to be */
                            /* finished, and return the value.               */


The following grammar changes have been made:

# Function call.
| async id ( [ exp { , exp }] )
# Method call.
| async lvalue . id ( [ exp { , exp }] )

These changes allow usage of the following code:

var buf := async read(10)
var ouf := async


The implementation has been done at the LLVM IR level, in the Tiger Compiler (TC-L).

The perfect usage of LLVM would be to use it as an external library, and provide a non-intrusive usage for our front-end.

But, we can’t use LLVM IR in order to lower the calls, since we need to do some target-specific operations, like push, regarding the calling convention.

First, we’ll need a MachineFunctionPass, which allows us to use MachineInstrs in order to modify the current function, so to implement the intrinsics.

The problem is that we can’t load a MachineFunctionPass as a dynamic pass, as we can do with a FunctionPass, so, we need to modify the X86 backend.

The repository containing the modified LLVM implementation is here.


The approach we took for an asynchronous task is the following:

  • Spawn a new POSIX thread (pthreads API)
  • Keep track of the handle of the thread.
  • Join the thread when the result of the function is used.

Let’s take an example of an use-case of the async call.

  /* This function computes an useless result. */
  primitive compute(v : int) : int

  /* Var containing the asynchronous result of the computation. */
  var async_result := async compute(300)

  /* Var containing the synchronous result of the computation. */
  var result := compute(300)
  /* Sync. */

  /* Async. */
; ModuleID = 'basic.ll'
target triple = "i386-unknown-linux-gnu"

; Function Attrs: inlinehint nounwind
declare void @tc_print_int(i32) #0

; Function Attrs: nounwind
define void @tc_main() #1 {
  %result_19 = alloca i32
  %async_result_18 = alloca i32

  %async_result_thread = call i32 (i8* (...)*, i32, ...) @tc_async_call(i8* (...)* bitcast (i32 (i32)* @tc_compute to i8* (...)*), i32 1, i32 300)

  %call_compute = call i32 @tc_compute(i32 300)
  store i32 %call_compute, i32* %result_19
  %result_191 = load i32, i32* %result_19
  call void @tc_print_int(i32 %result_191)
  %0 = bitcast i32* %async_result_18 to i8**

  call void @tc_async_return(i32 %async_result_thread, i8** %0)

  %async_result_182 = load i32, i32* %async_result_18
  call void @tc_print_int(i32 %async_result_182)
  ret void

; Function Attrs: inlinehint nounwind
declare i32 @tc_compute(i32) #0

declare i32 @tc_async_call(i8* (...)*, i32, ...)

declare void @tc_async_return(i32, i8**)

attributes #0 = { inlinehint nounwind }
attributes #1 = { nounwind }


So, the following routines have been added to the runtime:

First, added to the C runtime:

typedef void *(*function_t)();

struct async_function
  function_t f;
  size_t nb_args;
  void *args[0];

/** \name Internal functions (calls generated by the compiler only). */
/** \{ */

/** \brief Call a function in a separate thread.
    \param f       The callee function.
    \param nb_args The number of arguments the function is taking.
    \param ...     The arguments to be passed to the function.

    An element size is always the size of a word on 32-bit systems.
pthread_t tc_async_call(function_t f, int nb_args, ...)
  // Prepare the argument list.
  va_list ap;
  va_start(ap, nb_args);

  // Allocate heap space for the structure followed by the arguments.
  struct async_function *a = malloc(sizeof (struct async_function)
                                    + nb_args * sizeof (void *));
  a->nb_args = nb_args;
  a->f = f;

  // Fill the arguments.
  for (int i = 0; i < nb_args; ++i)
    a->args[i] = va_arg(ap, void *);

  // Cleanup the argument list.

  // Create a thread with the intermediate function call.
  pthread_t thread;
  int res = pthread_create(&thread, NULL, &tc_async_wrapper, a);
  assert(!res && "Failed to create a thread.");
  return thread;
/** \} */

/** \name Internal functions (calls generated by the compiler only). */
/** \{ */

/** \brief Waits for the result of a task and sets the result.
    \param thread  The thread id that should be joined.
    \param result  The result memory slot.

    An element size is always the size of a word on 32-bit systems.
void tc_async_return(pthread_t thread, void **result)
  pthread_join(thread, result);
/** \} */

then, the rest of it added to the LLVM IR runtime:

%struct.async_function = type { i8* (...)*, i32, [0 x i8*] }

declare i8* @llvm.tc_async_call(i8* (...)* %f, i32 %nb_args, i8** %args) #1

; Function Attrs: nounwind
define i8* @tc_async_wrapper(i8* %arg) #0 {
  %fun = bitcast i8* %arg to %struct.async_function*
  %pf = getelementptr inbounds %struct.async_function,
             %struct.async_function* %fun, i32 0, i32 0
  %pnb_args = getelementptr inbounds %struct.async_function,
             %struct.async_function* %fun, i32 0, i32 1
  %pargs = getelementptr inbounds %struct.async_function,
             %struct.async_function* %fun, i32 0, i32 2
  %args = bitcast [0 x i8*]* %pargs to i8**
  %f = load i8* (...)*, i8* (...)** %pf, align 4
  %nb_args = load i32, i32* %pnb_args, align 4

  %result = call i8* @llvm.tc_async_call(i8* (...)* %f, i32 %nb_args,
                                          i8** %args)
  ret i8* %result

attributes #0 = { nounwind "disable-tail-calls"="false" "no-frame-pointer-elim"="true" }
attributes #1 = { nounwind }

Future work

Thread pool

Another solution is to provide a thread pool in Tiger’s runtime, available using compiler intrinsics, translated during TC-L.

Resumable routines

The best solution is to handle the routines as resumable routines.

This means that the routines can be paused, the context saved, and replaced with another routine, etc.

This implies that the routines should contain some kind of context, some kind of state that allows the compiler to define where to resume.