diff --git a/gcc/calls.cc b/gcc/calls.cc index 6dd6f73e9780..f0261fcc6711 100644 --- a/gcc/calls.cc +++ b/gcc/calls.cc @@ -1367,7 +1367,8 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, with those made by function.cc. */ /* See if this argument should be passed by invisible reference. */ - function_arg_info arg (type, argpos < n_named_args); + function_arg_info arg (type, argpos < n_named_args, + argpos == n_named_args - 1); if (pass_by_reference (args_so_far_pnt, arg)) { const bool callee_copies @@ -1540,6 +1541,7 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, #endif reg_parm_stack_space, args[i].pass_on_stack ? 0 : args[i].partial, + args_so_far, fndecl, args_size, &args[i].locate); #ifdef BLOCK_REG_PADDING else @@ -4256,6 +4258,7 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, argvec[count].reg != 0, #endif reg_parm_stack_space, 0, + args_so_far, NULL_TREE, &args_size, &argvec[count].locate); if (argvec[count].reg == 0 || argvec[count].partial != 0 @@ -4347,6 +4350,7 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, argvec[count].reg != 0, #endif reg_parm_stack_space, argvec[count].partial, + args_so_far, NULL_TREE, &args_size, &argvec[count].locate); args_size.constant += argvec[count].locate.size.constant; gcc_assert (!argvec[count].locate.size.var); diff --git a/gcc/calls.h b/gcc/calls.h index fd7836e481d0..a2a4f85e23ec 100644 --- a/gcc/calls.h +++ b/gcc/calls.h @@ -35,24 +35,43 @@ class function_arg_info { public: function_arg_info () - : type (NULL_TREE), mode (VOIDmode), named (false), + : type (NULL_TREE), mode (VOIDmode), named (false), last_named (false), pass_by_reference (false) {} /* Initialize an argument of mode MODE, either before or after promotion. 
*/ function_arg_info (machine_mode mode, bool named) - : type (NULL_TREE), mode (mode), named (named), pass_by_reference (false) + : type (NULL_TREE), mode (mode), named (named), last_named (false), + pass_by_reference (false) + {} + + function_arg_info (machine_mode mode, bool named, bool last_named) + : type (NULL_TREE), mode (mode), named (named), last_named (last_named), + pass_by_reference (false) {} /* Initialize an unpromoted argument of type TYPE. */ function_arg_info (tree type, bool named) - : type (type), mode (TYPE_MODE (type)), named (named), + : type (type), mode (TYPE_MODE (type)), named (named), last_named (false), pass_by_reference (false) {} + /* Initialize an unpromoted argument of type TYPE. */ + function_arg_info (tree type, bool named, bool last_named) + : type (type), mode (TYPE_MODE (type)), named (named), + last_named (last_named), pass_by_reference (false) + {} + /* Initialize an argument with explicit properties. */ function_arg_info (tree type, machine_mode mode, bool named) - : type (type), mode (mode), named (named), pass_by_reference (false) + : type (type), mode (mode), named (named), last_named (false), + pass_by_reference (false) + {} + + /* Initialize an argument with explicit properties. */ + function_arg_info (tree type, machine_mode mode, bool named, bool last_named) + : type (type), mode (mode), named (named), last_named (last_named), + pass_by_reference (false) {} /* Return true if the gimple-level type is an aggregate. */ @@ -105,6 +124,9 @@ class function_arg_info "..."). See also TARGET_STRICT_ARGUMENT_NAMING. */ unsigned int named : 1; + /* True if this is the last named argument. */ + unsigned int last_named : 1; + /* True if we have decided to pass the argument by reference, in which case the function_arg_info describes a pointer to the original argument. 
*/ unsigned int pass_by_reference : 1; diff --git a/gcc/common.opt b/gcc/common.opt index 016bd76354f1..b01f7a7a4a22 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2790,6 +2790,10 @@ fstack-usage Common RejectNegative Var(flag_stack_usage) Output stack usage information on a per-function basis. +fstack-use-cumulative-args +Common RejectNegative Var(flag_stack_use_cumulative_args) Init(STACK_USE_CUMULATIVE_ARGS_INIT) +Use cumulative args-based stack layout hooks. + fstrength-reduce Common Ignore Does nothing. Preserved for backward compatibility. diff --git a/gcc/config.gcc b/gcc/config.gcc index 3b04362df7ce..25694fddff5e 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -1112,13 +1112,22 @@ case ${target} in ;; esac +# Defaults that need fixing. # Figure out if we need to enable -foff-stack-trampolines by default case ${target} in +aarch64*-*-darwin2*) + # This only applies to arm64 Darwin variadic funtions. + tm_defines="$tm_defines STACK_USE_CUMULATIVE_ARGS_INIT=1" + # This is required; executable stack is forbidden. + tm_defines="$tm_defines OFF_STACK_TRAMPOLINES_INIT=1" + ;; *-*-darwin2*) + tm_defines="$tm_defines STACK_USE_CUMULATIVE_ARGS_INIT=0" # Currently, we do this for macOS 11 and above. 
tm_defines="$tm_defines OFF_STACK_TRAMPOLINES_INIT=1" ;; *) + tm_defines="$tm_defines STACK_USE_CUMULATIVE_ARGS_INIT=0" tm_defines="$tm_defines OFF_STACK_TRAMPOLINES_INIT=0" ;; esac @@ -1161,7 +1170,7 @@ aarch64*-*-elf | aarch64*-*-fuchsia* | aarch64*-*-rtems*) done TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'` ;; -aarch64-*-darwin* | arm64-*-darwin*) +aarch64-*-darwin* ) tm_file="${tm_file} aarch64/aarch64-errata.h" tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-darwin" tm_defines="${tm_defines} TARGET_DEFAULT_ASYNC_UNWIND_TABLES=1" diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 82b3d86c9ffd..34f0260a3689 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -916,6 +916,7 @@ void aarch64_expand_vector_init (rtx, rtx); void aarch64_sve_expand_vector_init (rtx, rtx); void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, const_tree, unsigned, bool = false); +void aarch64_init_cumulative_incoming_args (CUMULATIVE_ARGS *, const_tree, rtx); void aarch64_init_expanders (void); void aarch64_emit_call_insn (rtx); void aarch64_register_pragmas (void); diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index c5723573fac7..d367bd729b01 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -4077,8 +4077,10 @@ static bool aarch64_takes_arguments_in_sve_regs_p (const_tree fntype) { CUMULATIVE_ARGS args_so_far_v; + /* This does not apply to variadic functions, so all the (currently + uncounted) arguments must be named. 
*/ aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX, - NULL_TREE, 0, true); + NULL_TREE, -1, true); cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v); for (tree chain = TYPE_ARG_TYPES (fntype); @@ -7659,6 +7661,13 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) return; pcum->aapcs_arg_processed = true; + if (TARGET_MACHO) + { + /* Set suitable defaults for queries. */ + pcum->darwinpcs_arg_boundary + = aarch64_function_arg_alignment (mode, type, &abi_break); + pcum->darwinpcs_arg_padding = BITS_PER_UNIT; + } pure_scalable_type_info pst_info; if (type && pst_info.analyze_registers (type)) @@ -7715,7 +7724,11 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) /* No frontends can create types with variable-sized modes, so we shouldn't be asked to pass or return them. */ size = GET_MODE_SIZE (mode).to_constant (); - pcum->darwinpcs_stack_bytes = size; + + if (TARGET_MACHO) + /* Since we can pack things on the stack, we need the unrounded size. */ + pcum->darwinpcs_stack_bytes = size; + size = ROUND_UP (size, UNITS_PER_WORD); allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode); @@ -7784,6 +7797,7 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) } pcum->aapcs_reg = par; } + pcum->darwinpcs_stack_bytes = 0; return; } else @@ -7869,8 +7883,8 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) } pcum->aapcs_reg = par; } - pcum->aapcs_nextncrn = ncrn + nregs; + pcum->darwinpcs_stack_bytes = 0; return; } @@ -7886,20 +7900,24 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) if (TARGET_MACHO) { /* Darwin does not round up the allocation for smaller entities to 8 - bytes. It only requires the natural alignment for these. There - was no darwinpcs for GCC 9, so neither the implementation change - nor the warning should fire here. 
- - size is rounded up to 8 bytes, so will account for enough slots to - accommodate the entire argument - potentially, with some padding - at the end. When the current position is 0 - any allocation needs - a stack slot. CHECKME: do we need to align 16byte entities? + bytes. It only requires the natural alignment for these. but we don't do this for: * unnamed parms in variadic functions - * complex types smaller than 4 bytes - each get their own slot. */ - if (!arg.named + * complex types + * unions + * aggregates (except for homogeneous ones which are handles as the + enclosed type). + each entry starts a new slot. + + 16 byte entities are naturally aligned on the stack. + There was no darwinpcs for GCC 9, so neither the implementation + change nor the warning should fire here (i.e. we do not need to check + if 16byte entities alter the stack size). */ + +gcc_checking_assert (arg.named == pcum->named_p); + pcum->darwinpcs_arg_padding = BITS_PER_UNIT; + if (!pcum->named_p || TREE_CODE (type) == COMPLEX_TYPE || (TREE_CODE (type) == RECORD_TYPE && !is_ha && !SCALAR_FLOAT_MODE_P (pcum->aapcs_vfp_rmode)) @@ -7908,30 +7926,53 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) pcum->aapcs_stack_words = size / UNITS_PER_WORD; pcum->darwinpcs_sub_word_offset = 0; pcum->darwinpcs_sub_word_pos = 0; - /* We skip the re-alignment for 16byte things, since we currently - assume that the darwinpcs doesn't force such alignment. */ + pcum->darwinpcs_arg_boundary = MAX (align, PARM_BOUNDARY); + if (!pcum->named_p) + pcum->darwinpcs_arg_padding = PARM_BOUNDARY; return; } - if (pcum->darwinpcs_sub_word_pos == 0) - pcum->aapcs_stack_words = size / UNITS_PER_WORD; - - int new_pos + /* Updated sub-word offset aligned for the new object. + We are looking for the case that the new object will fit after some + existing object(s) in the same stack slot. In that case, we do not + need to add any more stack space for it. 
*/ + int new_off = ROUND_UP (pcum->darwinpcs_sub_word_pos, align / BITS_PER_UNIT); - if (new_pos >= UNITS_PER_WORD) + + if (new_off >= UNITS_PER_WORD) { - /* We are not catering for the possible 16byte alignment bump. */ - pcum->aapcs_stack_words += 1; - new_pos = 0; + /* That exceeds a stack slot, start a new one. */ + pcum->darwinpcs_sub_word_offset = 0; + pcum->darwinpcs_sub_word_pos = 0; + new_off = 0; + } + /* This is the end of the new object. */ + int new_pos = new_off + pcum->darwinpcs_stack_bytes; + + if (pcum->darwinpcs_sub_word_pos == 0) + /* New stack slot, just allocate one or more words, and note where + the next arg will start. */ + pcum->aapcs_stack_words = size / UNITS_PER_WORD; + else if (new_pos <= UNITS_PER_WORD) + /* Old stack slot, object starts at new_off and goes to new_pos, we do + not add any stack space. */ + pcum->darwinpcs_sub_word_offset = new_off; + pcum->darwinpcs_sub_word_pos = new_pos; + pcum->darwinpcs_arg_boundary = align; + if (pcum->last_named_p && new_pos > 0) + { + /* Round the last named arg to the start of the next stack slot. */ + if (new_pos <= 4) + pcum->darwinpcs_arg_padding = PARM_BOUNDARY; + else if (new_pos <= 6) + pcum->darwinpcs_arg_padding = 4 * BITS_PER_UNIT; + else if (pcum->darwinpcs_sub_word_pos <= 7) + pcum->darwinpcs_arg_padding = 2 * BITS_PER_UNIT; } - pcum->darwinpcs_sub_word_offset = new_pos; - new_pos += pcum->darwinpcs_stack_bytes; - if (new_pos > UNITS_PER_WORD) - pcum->aapcs_stack_words += new_pos / UNITS_PER_WORD; - pcum->darwinpcs_sub_word_pos = new_pos % UNITS_PER_WORD; return; } + /* size was already rounded up to PARM_BOUNDARY. 
*/ pcum->aapcs_stack_words = size / UNITS_PER_WORD; if (align == 16 * BITS_PER_UNIT) { @@ -7989,6 +8030,23 @@ aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum, pcum->darwinpcs_stack_bytes = 0; pcum->darwinpcs_sub_word_offset = 0; pcum->darwinpcs_sub_word_pos = 0; + pcum->darwinpcs_arg_boundary = BITS_PER_UNIT; + pcum->darwinpcs_arg_padding = BITS_PER_UNIT; + /* If we have been invoked for incoming args, then n_named will have been + set to -1, but we should have a function decl - so pick up the named + count from that. If that fails, and we end up with -1, this effectively + corresponds to assuming that there is an arbitrary number of named + args. */ + pcum->darwinpcs_n_named = n_named; + if (n_named == (unsigned)-1 && fndecl) + { + tree fnt = TREE_TYPE (fndecl); + if (fnt && TYPE_ARG_TYPES (fnt)) + pcum->darwinpcs_n_named = list_length (TYPE_ARG_TYPES (fnt)); + } + pcum->darwinpcs_n_args_processed = 0; + pcum->named_p = pcum->darwinpcs_n_named != 0; + pcum->last_named_p = pcum->darwinpcs_n_named == 1; pcum->silent_p = silent_p; pcum->aapcs_vfp_rmode = VOIDmode; @@ -8029,6 +8087,7 @@ aarch64_function_arg_advance (cumulative_args_t pcum_v, || pcum->pcs_variant == ARM_PCS_SVE) { aarch64_layout_arg (pcum_v, arg); + pcum->darwinpcs_n_args_processed++; gcc_assert (TARGET_MACHO || (pcum->aapcs_reg != NULL_RTX) != (pcum->aapcs_stack_words != 0)); @@ -8039,6 +8098,12 @@ aarch64_function_arg_advance (cumulative_args_t pcum_v, pcum->aapcs_stack_size += pcum->aapcs_stack_words; pcum->aapcs_stack_words = 0; pcum->aapcs_reg = NULL_RTX; + pcum->darwinpcs_arg_boundary = BITS_PER_UNIT; + pcum->darwinpcs_arg_padding = BITS_PER_UNIT; + pcum->named_p + = pcum->darwinpcs_n_args_processed < pcum->darwinpcs_n_named; + pcum->last_named_p + = pcum->darwinpcs_n_args_processed + 1 == pcum->darwinpcs_n_named; } } @@ -8055,11 +8120,9 @@ aarch64_function_arg_regno_p (unsigned regno) if the type requires it. 
This makes sure that both before and after the layout of each argument, the Next Stacked Argument Address (NSAA) will have a minimum alignment of 8 bytes. - For darwinpcs, parameters get their natural alignment (up to the - STACK_BOUNDARY). Therefore, the stack can be aligned less than 8 - bytes after a smaller aligned type is placed. However, the stack will - always be counted in PARM_BOUNDARY chunks, darwinpcs will just fill - the last allocated chunk with several args, potentially. */ + + For darwinpcs, this is only called to lower va_arg entries which are + always aligned as for AAPCS64. */ static unsigned int aarch64_function_arg_boundary (machine_mode mode, const_tree type) @@ -8068,7 +8131,7 @@ aarch64_function_arg_boundary (machine_mode mode, const_tree type) unsigned int alignment = aarch64_function_arg_alignment (mode, type, &abi_break); #if TARGET_MACHO - /* Temporary fudge to put some non-scalar types in distinct stack slots. */ + /* This can only work for unnamed args. */ machine_mode comp_mode = VOIDmode; int nregs; bool is_ha; @@ -8085,7 +8148,7 @@ aarch64_function_arg_boundary (machine_mode mode, const_tree type) if (abi_break & warn_psabi) { abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY); - if (alignment != abi_break) + if (alignment != abi_break && !TARGET_MACHO) inform (input_location, "parameter passing for argument of type " "%qT changed in GCC 9.1", type); } @@ -8094,17 +8157,93 @@ aarch64_function_arg_boundary (machine_mode mode, const_tree type) #endif } -#if TARGET_MACHO -/* Implement TARGET_FUNCTION_ARG_ROUND_BOUNDARY for darwinpcs which allows - non-standard passing of byte-aligned items [D.2]. - TODO: check if this extends to packed aggregates. */ +/* For Darwin, we want to use the arg boundary computed when laying out the + function arg, to cope with items packed on the stack and the different + rules applied to unnamed parms. 
*/ static unsigned int -aarch64_function_arg_round_boundary (machine_mode, const_tree) +aarch64_function_arg_boundary_ca (machine_mode mode ATTRIBUTE_UNUSED, + const_tree type ATTRIBUTE_UNUSED, + cumulative_args_t ca ATTRIBUTE_UNUSED) { - return BITS_PER_UNIT; -} + unsigned int abi_break; + unsigned int alignment = aarch64_function_arg_alignment (mode, type, + &abi_break); +#if TARGET_MACHO + CUMULATIVE_ARGS *pcum = get_cumulative_args (ca); +gcc_checking_assert (pcum->aapcs_arg_processed); + + bool named_p = pcum->darwinpcs_n_args_processed < pcum->darwinpcs_n_named; +gcc_checking_assert (named_p == pcum->named_p); + machine_mode comp_mode = VOIDmode; + int nregs; + bool is_ha; + aarch64_vfp_is_call_or_return_candidate (mode, type, &comp_mode, &nregs, + &is_ha, /*silent*/true); + bool no_pack = (TREE_CODE (type) == COMPLEX_TYPE + || (TREE_CODE (type) == RECORD_TYPE + && !is_ha && !SCALAR_FLOAT_MODE_P (comp_mode)) + || TREE_CODE (type) == UNION_TYPE); + + bool in_regs = (pcum->aapcs_reg != NULL_RTX); + + if ((named_p && !no_pack) || in_regs) + ; /* Leave the alignment as natural. */ + else + alignment = MAX (alignment, PARM_BOUNDARY); +gcc_checking_assert (alignment == pcum->darwinpcs_arg_boundary); + return MIN (alignment, STACK_BOUNDARY); + +#else + alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY); + if (abi_break & warn_psabi) + { + abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY); + if (alignment != abi_break) + inform (input_location, "parameter passing for argument of type " + "%qT changed in GCC 9.1", type); + } + + return alignment; #endif +} + +/* Implement TARGET_FUNCTION_ARG_ROUND_BOUNDARY_CA for darwinpcs which allows + non-standard passing of byte-aligned items [D.2]. This is done by pulling + the values out of the cumulative args struct. 
*/ + +static unsigned int +aarch64_function_arg_round_boundary_ca (machine_mode mode ATTRIBUTE_UNUSED, + const_tree type ATTRIBUTE_UNUSED, + cumulative_args_t ca) +{ + CUMULATIVE_ARGS *pcum = get_cumulative_args (ca); +gcc_checking_assert (pcum->aapcs_arg_processed); + bool named_p = pcum->darwinpcs_n_args_processed < pcum->darwinpcs_n_named; +gcc_checking_assert (named_p == pcum->named_p); + bool last_named_p = pcum->darwinpcs_n_args_processed + 1 == pcum->darwinpcs_n_named; +gcc_checking_assert (last_named_p == pcum->last_named_p); + + unsigned boundary = BITS_PER_UNIT; + if (last_named_p && pcum->darwinpcs_sub_word_pos > 0) + { + /* Round the last named arg to the start of the next stack slot. */ + if (pcum->darwinpcs_sub_word_pos <= 4) + boundary = PARM_BOUNDARY; + else if (pcum->darwinpcs_sub_word_pos <= 6) + boundary = 4 * BITS_PER_UNIT; + else if (pcum->darwinpcs_sub_word_pos <= 7) + boundary = 2 * BITS_PER_UNIT; + } + else if (named_p) + /* Named args are naturally aligned, but with no rounding. */ + ; + else + /* un-named args are rounded to fill slots. */ + boundary = PARM_BOUNDARY; +gcc_checking_assert (boundary == pcum->darwinpcs_arg_padding); + return boundary; +} /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */ @@ -20367,7 +20506,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, int vr_saved = cfun->va_list_fpr_size; if (TARGET_MACHO) - return; + return default_setup_incoming_varargs (cum_v, arg, pretend_size, no_rtl); /* The caller has advanced CUM up to, but not beyond, the last named argument. 
Advance a local copy of CUM past the last "real" named @@ -28103,10 +28242,11 @@ aarch64_run_selftests (void) #undef TARGET_FUNCTION_ARG_BOUNDARY #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary -#if TARGET_MACHO -#undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY -#define TARGET_FUNCTION_ARG_ROUND_BOUNDARY aarch64_function_arg_round_boundary -#endif +#undef TARGET_FUNCTION_ARG_BOUNDARY_CA +#define TARGET_FUNCTION_ARG_BOUNDARY_CA aarch64_function_arg_boundary_ca + +#undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY_CA +#define TARGET_FUNCTION_ARG_ROUND_BOUNDARY_CA aarch64_function_arg_round_boundary_ca #undef TARGET_FUNCTION_ARG_PADDING #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 4b04ddd19a42..d3efe9cd4759 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -928,12 +928,24 @@ typedef struct aapcs_reg == NULL_RTX. */ int aapcs_stack_size; /* The total size (in words, per 8 byte) of the stack arg area so far. */ + + /* In the darwinpcs, items smaller than one word are packed onto the stack + naturally aligned. Unnamed parameters passed in a variadic call are, + however, aligned the same way as the AAPCS64. This means that we need to + pad the last named arg to the next parm boundary (and hence notice when + we are processing that arg). */ int darwinpcs_stack_bytes; /* If the argument is passed on the stack, this the byte-size. */ int darwinpcs_sub_word_offset;/* This is the offset of this arg within a word when placing smaller items for darwinpcs. */ int darwinpcs_sub_word_pos; /* The next byte available within the word for darwinpcs. */ + unsigned darwinpcs_arg_boundary; /* The computed argument boundary. */ + unsigned darwinpcs_arg_padding; /* The computed argument padding. */ + unsigned darwinpcs_n_named; /* Number of named arguments. */ + unsigned darwinpcs_n_args_processed; /* Processed so far. */ + bool named_p; /* Is this arg named? 
*/ + bool last_named_p; /* Is this the last named arg? */ bool silent_p; /* True if we should act silently, rather than raise an error for invalid calls. */ } CUMULATIVE_ARGS; diff --git a/gcc/cumulative-args.h b/gcc/cumulative-args.h new file mode 100644 index 000000000000..b60928e37f9d --- /dev/null +++ b/gcc/cumulative-args.h @@ -0,0 +1,20 @@ +#ifndef GCC_CUMULATIVE_ARGS_H +#define GCC_CUMULATIVE_ARGS_H + +#if CHECKING_P + +struct cumulative_args_t { void *magic; void *p; }; + +#else /* !CHECKING_P */ + +/* When using a GCC build compiler, we could use + __attribute__((transparent_union)) to get cumulative_args_t function + arguments passed like scalars where the ABI would mandate a less + efficient way of argument passing otherwise. However, that would come + at the cost of less type-safe !CHECKING_P compilation. */ + +union cumulative_args_t { void *p; }; + +#endif /* !CHECKING_P */ + +#endif /* GCC_CUMULATIVE_ARGS_H */ diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 3af78bb7cb20..1a1d6b074209 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -706,6 +706,7 @@ Objective-C and Objective-C++ Dialects}. -fverbose-asm -fpack-struct[=@var{n}] @gol -fleading-underscore -ftls-model=@var{model} @gol -fstack-reuse=@var{reuse_level} @gol +-fstack-use-cumulative-args @gol -ftrampolines -ftrapv -fwrapv @gol -fvisibility=@r{[}default@r{|}internal@r{|}hidden@r{|}protected@r{]} @gol -fstrict-volatile-bitfields -fsync-libcalls} @@ -17971,6 +17972,17 @@ the behavior of older compilers in which temporaries' stack space is not reused, the aggressive stack reuse can lead to runtime errors. This option is used to control the temporary stack reuse optimization. +@item -fstack-use-cumulative-args +@opindex fstack_use_cumulative_args +This option instructs the compiler to use the +@code{cumulative_args_t}-based stack layout target hooks, +@code{TARGET_FUNCTION_ARG_BOUNDARY_CA} and +@code{TARGET_FUNCTION_ARG_ROUND_BOUNDARY_CA}. 
If a given target does +not define these hooks, the default behaviour is to fallback to using +the standard non-@code{_CA} variants instead. Certain targets (such as +AArch64 Darwin) require using the more advanced @code{_CA}-based +hooks: For these targets this option should be enabled by default. + @item -ftrapv @opindex ftrapv This option generates traps for signed overflow on addition, subtraction, diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 8fe49c2ba3db..c75361750dd9 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -4345,6 +4345,16 @@ with the specified mode and type. The default hook returns @code{PARM_BOUNDARY} for all arguments. @end deftypefn +@deftypefn {Target Hook} {unsigned int} TARGET_FUNCTION_ARG_BOUNDARY_CA (machine_mode @var{mode}, const_tree @var{type}, cumulative_args_t @var{ca}) +This is the @code{cumulative_args_t}-based version of +@code{TARGET_FUNCTION_ARG_BOUNDARY}. Define this hook if you need more +fine-grained control over argument alignment, e.g. depending on whether +it is a named argument or not, or any other criteria that you choose to +place in the @var{ca} structure. + +The default hook will call @code{TARGET_FUNCTION_ARG_BOUNDARY}. +@end deftypefn + @deftypefn {Target Hook} {unsigned int} TARGET_FUNCTION_ARG_ROUND_BOUNDARY (machine_mode @var{mode}, const_tree @var{type}) Normally, the size of an argument is rounded up to @code{PARM_BOUNDARY}, which is the default value for this hook. You can define this hook to @@ -4352,6 +4362,16 @@ return a different value if an argument size must be rounded to a larger value. @end deftypefn +@deftypefn {Target Hook} {unsigned int} TARGET_FUNCTION_ARG_ROUND_BOUNDARY_CA (machine_mode @var{mode}, const_tree @var{type}, cumulative_args_t @var{ca}) +This is the @code{cumulative_args_t}-based version of +@code{TARGET_FUNCTION_ARG_ROUND_BOUNDARY}. Define this hook if you need more +fine-grained control over argument size rounding, e.g. 
depending on whether +it is a named argument or not, or any other criteria that you choose to +place in the @var{ca} structure. + +The default hook will call @code{TARGET_FUNCTION_ARG_ROUND_BOUNDARY}. +@end deftypefn + @defmac FUNCTION_ARG_REGNO_P (@var{regno}) A C expression that is nonzero if @var{regno} is the number of a hard register in which function arguments are sometimes passed. This does diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 62c49ac46de6..f01ab609a7de 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -3339,8 +3339,12 @@ required. @hook TARGET_FUNCTION_ARG_BOUNDARY +@hook TARGET_FUNCTION_ARG_BOUNDARY_CA + @hook TARGET_FUNCTION_ARG_ROUND_BOUNDARY +@hook TARGET_FUNCTION_ARG_ROUND_BOUNDARY_CA + @defmac FUNCTION_ARG_REGNO_P (@var{regno}) A C expression that is nonzero if @var{regno} is the number of a hard register in which function arguments are sometimes passed. This does diff --git a/gcc/function.cc b/gcc/function.cc index dc333c27e920..f2497c466127 100644 --- a/gcc/function.cc +++ b/gcc/function.cc @@ -2448,7 +2448,10 @@ assign_parm_find_data_types (struct assign_parm_data_all *all, tree parm, else if (DECL_CHAIN (parm)) data->arg.named = 1; /* Not the last non-variadic parm. */ else if (targetm.calls.strict_argument_naming (all->args_so_far)) - data->arg.named = 1; /* Only variadic ones are unnamed. */ + { + data->arg.named = 1; /* Only variadic ones are unnamed. */ + data->arg.last_named = 1; + } else data->arg.named = 0; /* Treat as variadic. 
*/ @@ -2505,6 +2508,7 @@ assign_parms_setup_varargs (struct assign_parm_data_all *all, function_arg_info last_named_arg = data->arg; last_named_arg.named = true; + last_named_arg.last_named = true; targetm.calls.setup_incoming_varargs (all->args_so_far, last_named_arg, &varargs_pretend_bytes, no_rtl); @@ -2613,7 +2617,9 @@ assign_parm_find_entry_rtl (struct assign_parm_data_all *all, locate_and_pad_parm (data->arg.mode, data->arg.type, in_regs, all->reg_parm_stack_space, - entry_parm ? data->partial : 0, current_function_decl, + entry_parm ? data->partial : 0, + all->args_so_far, + current_function_decl, &all->stack_args_size, &data->locate); /* Update parm_stack_boundary if this parameter is passed in the @@ -3944,7 +3950,8 @@ gimplify_parameters (gimple_seq *cleanup) if (data.arg.pass_by_reference) { tree type = TREE_TYPE (data.arg.type); - function_arg_info orig_arg (type, data.arg.named); + function_arg_info orig_arg (type, data.arg.named, + data.arg.last_named); if (reference_callee_copied (&all.args_so_far_v, orig_arg)) { tree local, t; @@ -4047,6 +4054,7 @@ gimplify_parameters (gimple_seq *cleanup) void locate_and_pad_parm (machine_mode passed_mode, tree type, int in_regs, int reg_parm_stack_space, int partial, + cumulative_args_t ca, tree fndecl ATTRIBUTE_UNUSED, struct args_size *initial_offset_ptr, struct locate_and_pad_arg_data *locate) @@ -4084,9 +4092,23 @@ locate_and_pad_parm (machine_mode passed_mode, tree type, int in_regs, ? 
arg_size_in_bytes (type) : size_int (GET_MODE_SIZE (passed_mode))); where_pad = targetm.calls.function_arg_padding (passed_mode, type); - boundary = targetm.calls.function_arg_boundary (passed_mode, type); - round_boundary = targetm.calls.function_arg_round_boundary (passed_mode, - type); + + if (flag_stack_use_cumulative_args) + { + boundary = targetm.calls.function_arg_boundary_ca (passed_mode, + type, + ca); + round_boundary = targetm.calls.function_arg_round_boundary_ca + (passed_mode, type, ca); + } + else + { + boundary = targetm.calls.function_arg_boundary (passed_mode, + type); + round_boundary = targetm.calls.function_arg_round_boundary + (passed_mode, type); + } + locate->where_pad = where_pad; /* Alignment can't exceed MAX_SUPPORTED_STACK_ALIGNMENT. */ diff --git a/gcc/function.h b/gcc/function.h index d7deaebee9c4..f5b70923e543 100644 --- a/gcc/function.h +++ b/gcc/function.h @@ -20,6 +20,7 @@ along with GCC; see the file COPYING3. If not see #ifndef GCC_FUNCTION_H #define GCC_FUNCTION_H +#include "cumulative-args.h" /* Stack of pending (incomplete) sequences saved by `start_sequence'. Each element describes one pending sequence. @@ -665,6 +666,7 @@ extern int aggregate_value_p (const_tree, const_tree); extern bool use_register_for_decl (const_tree); extern gimple_seq gimplify_parameters (gimple_seq *); extern void locate_and_pad_parm (machine_mode, tree, int, int, int, + cumulative_args_t, tree, struct args_size *, struct locate_and_pad_arg_data *); extern void generate_setjmp_warnings (void); diff --git a/gcc/target.def b/gcc/target.def index 082a7c62f34d..07aea21032f5 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -4995,6 +4995,18 @@ with the specified mode and type. The default hook returns\n\ unsigned int, (machine_mode mode, const_tree type), default_function_arg_boundary) +DEFHOOK +(function_arg_boundary_ca, + "This is the @code{cumulative_args_t}-based version of\n\ +@code{TARGET_FUNCTION_ARG_BOUNDARY}. 
Define this hook if you need more\n\ +fine-grained control over argument alignment, e.g. depending on whether\n\ +it is a named argument or not, or any other criteria that you choose to\n\ +place in the @var{ca} structure.\n\ +\n\ +The default hook will call @code{TARGET_FUNCTION_ARG_BOUNDARY}.", + unsigned int, (machine_mode mode, const_tree type, cumulative_args_t ca), + default_function_arg_boundary_ca) + DEFHOOK (function_arg_round_boundary, "Normally, the size of an argument is rounded up to @code{PARM_BOUNDARY},\n\ @@ -5004,6 +5016,18 @@ value.", unsigned int, (machine_mode mode, const_tree type), default_function_arg_round_boundary) +DEFHOOK +(function_arg_round_boundary_ca, + "This is the @code{cumulative_args_t}-based version of\n\ +@code{TARGET_FUNCTION_ARG_ROUND_BOUNDARY}. Define this hook if you need more\n\ +fine-grained control over argument size rounding, e.g. depending on whether\n\ +it is a named argument or not, or any other criteria that you choose to\n\ +place in the @var{ca} structure.\n\ +\n\ +The default hook will call @code{TARGET_FUNCTION_ARG_ROUND_BOUNDARY}.", + unsigned int, (machine_mode mode, const_tree type, cumulative_args_t ca), + default_function_arg_round_boundary_ca) + /* Return the diagnostic message string if function without a prototype is not allowed for this 'val' argument; NULL otherwise. */ DEFHOOK diff --git a/gcc/target.h b/gcc/target.h index c836036ac7fa..7f8f488e1543 100644 --- a/gcc/target.h +++ b/gcc/target.h @@ -52,22 +52,7 @@ #include "tm.h" #include "hard-reg-set.h" #include "tree-core.h" - -#if CHECKING_P - -struct cumulative_args_t { void *magic; void *p; }; - -#else /* !CHECKING_P */ - -/* When using a GCC build compiler, we could use - __attribute__((transparent_union)) to get cumulative_args_t function - arguments passed like scalars where the ABI would mandate a less - efficient way of argument passing otherwise. However, that would come - at the cost of less type-safe !CHECKING_P compilation. 
*/ - -union cumulative_args_t { void *p; }; - -#endif /* !CHECKING_P */ +#include "cumulative-args.h" /* Types of memory operation understood by the "by_pieces" infrastructure. Used by the TARGET_USE_BY_PIECES_INFRASTRUCTURE_P target hook and diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc index d9e61552ad5a..ccca3ad5f427 100644 --- a/gcc/targhooks.cc +++ b/gcc/targhooks.cc @@ -851,6 +851,14 @@ default_function_arg_boundary (machine_mode mode ATTRIBUTE_UNUSED, return PARM_BOUNDARY; } +unsigned int +default_function_arg_boundary_ca (machine_mode mode ATTRIBUTE_UNUSED, + const_tree type ATTRIBUTE_UNUSED, + cumulative_args_t ca ATTRIBUTE_UNUSED) +{ + return default_function_arg_boundary (mode, type); +} + unsigned int default_function_arg_round_boundary (machine_mode mode ATTRIBUTE_UNUSED, const_tree type ATTRIBUTE_UNUSED) @@ -858,6 +866,14 @@ default_function_arg_round_boundary (machine_mode mode ATTRIBUTE_UNUSED, return PARM_BOUNDARY; } +unsigned int +default_function_arg_round_boundary_ca (machine_mode mode ATTRIBUTE_UNUSED, + const_tree type ATTRIBUTE_UNUSED, + cumulative_args_t ca ATTRIBUTE_UNUSED) +{ + return default_function_arg_round_boundary (mode, type); +} + void hook_void_bitmap (bitmap regs ATTRIBUTE_UNUSED) { diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 3ca25ab6edb5..25810be98384 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -154,6 +154,12 @@ extern unsigned int default_function_arg_boundary (machine_mode, const_tree); extern unsigned int default_function_arg_round_boundary (machine_mode, const_tree); +extern unsigned int default_function_arg_boundary_ca (machine_mode, + const_tree, + cumulative_args_t ca); +extern unsigned int default_function_arg_round_boundary_ca (machine_mode, + const_tree, + cumulative_args_t ca); extern bool hook_bool_const_rtx_commutative_p (const_rtx, int); extern rtx default_function_value (const_tree, const_tree, bool); extern HARD_REG_SET default_zero_call_used_regs (HARD_REG_SET); diff --git 
a/gcc/testsuite/gcc.target/aarch64/darwin/float128-00.c b/gcc/testsuite/gcc.target/aarch64/darwin/float128-00.c new file mode 100644 index 000000000000..29aec80fbaa9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/darwin/float128-00.c @@ -0,0 +1,38 @@ + +/* we need this for _Float128. */ +/* { dg-options "-std=gnu99 " } */ +/* We use the sections anchors to make the code easier to match. */ +/* { dg-additional-options " -O2 -fsection-anchors " } */ +/* { dg-final { check-function-bodies "**" "" "" { target *-*-darwin* } } } */ + +/* we should just pass q0 and q1 through +**foo: +** ... +** bl ___addtf3 +** ... +*/ + +__attribute__((__noinline__)) +_Float128 +foo (_Float128 a, _Float128 b) +{ + return a + b; +} + + +/* we should just load q0 and q1 +**call_foo: +** ... +** ldr q1, \[x[0-9]+\] +** ... +** ldr q0, \[x[0-9]+\] +** b _foo +** ... +*/ + +__attribute__((__noinline__)) +_Float128 +call_foo (void) +{ + return foo (1.0, 2.0); +} diff --git a/gcc/testsuite/gcc.target/aarch64/darwin/k+r-00.c b/gcc/testsuite/gcc.target/aarch64/darwin/k+r-00.c new file mode 100644 index 000000000000..443fb9688115 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/darwin/k+r-00.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ + +/* { dg-options "-std=gnu99 " } */ +/* { dg-additional-options "-O2 -fsection-anchors" } */ + + +/* What we care about here is that we get int loads from sp, sp+4 and sp+8. + * This code will change when we implement darwinpcs d.3 - since the + * promotions will no longer be needed (although they are harmless). +**test_k_r00: +** ldrsb w[0-9]+, \[sp, 4\] +** ldr x[0-9]+, \[sp, 8\] +** ... +** ldrsb w[0-9]+, \[sp\] +** ... 
+*/
+
+const char *
+test_k_r00 (r0, r1, r2, r3, r4, r5, r6, r7, a, b, c)
+  char r0, r1, r2, r3, r4, r5, r6, r7;
+  char a;
+  char b;
+  const char *c;
+{
+  if (a > 10 && b < 100)
+    return c;
+  return (char *)0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/darwin/variadic-01.c b/gcc/testsuite/gcc.target/aarch64/darwin/variadic-01.c
new file mode 100644
index 000000000000..c055aeae580e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/darwin/variadic-01.c
@@ -0,0 +1,102 @@
+/* { dg-do compile } */
+
+/* we need this for _Float128. */
+/* { dg-options "-std=gnu99 " } */
+/* We use the sections anchors to make the code easier to match. */
+/* { dg-additional-options " -O2 -fsection-anchors " } */
+/* { dg-final { check-function-bodies "**" "" "" { target *-*-darwin* } } } */
+
+#include <stdarg.h>
+
+/* What we care about here is that q0 and q1 are loaded from incoming sp and
+   sp+16.
+**foo:
+**	...
+**	ldr	q1, \[sp, 32\]
+**	ldr	q0, \[sp, 48\]
+**	...
+**	bl	___addtf3
+**	...
+*/
+
+__attribute__((__noinline__))
+_Float128
+foo (int n, ...)
+{
+  _Float128 a, b;
+  va_list ap;
+
+  va_start(ap, n);
+  a = va_arg(ap, _Float128);
+  b = va_arg(ap, _Float128);
+  va_end(ap);
+  return a + b;
+}
+
+/*
+**call_foo:
+**	...
+**	str	q[0-9]+, \[sp, 16\]
+**	...
+**	mov	w0, 2
+**	str	q[0-9]+, \[sp\]
+**	bl	_foo
+**	...
+*/
+
+__attribute__((__noinline__))
+_Float128
+call_foo (void)
+{
+  return foo (2, (_Float128)1.0, (_Float128)2.0);
+}
+
+/* What we care about here is that q0 and q1 are loaded from incoming sp and
+   sp+32 (with the int at sp+16).
+**bar:
+**	...
+**	ldr	w[0-9]+, \[x[0-9]+, 16\]
+**	ldr	q0, \[x[0-9]+\]
+**	...
+**	ldr	q1, \[x[0-9]+, 32\]
+**	bl	___addtf3
+**	...
+*/
+
+__attribute__((__noinline__))
+_Float128
+bar (int n, ...)
+{
+  _Float128 a, b;
+  va_list ap;
+
+  va_start(ap, n);
+  a = va_arg(ap, _Float128);
+  n = va_arg(ap, int);
+  if (n != 42)
+    __builtin_abort ();
+  b = va_arg(ap, _Float128);
+  va_end(ap);
+  return a + b;
+}
+
+/*
+**call_bar:
+**	...
+**	str	q[0-9]+, \[sp, 32\]
+**	...
+**	mov	w[0-9]+, 42
+**	str	w[0-9]+, \[sp, 16\]
+**	mov	w0, 2
+**	str	q[0-9]+, \[sp\]
+**	bl	_bar
+**	...
+*/
+
+__attribute__((__noinline__))
+_Float128
+call_bar (void)
+{
+  return bar (2, (_Float128)1.0,
+	      42, (_Float128)2.0);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/darwin/variadic-02.c b/gcc/testsuite/gcc.target/aarch64/darwin/variadic-02.c
new file mode 100644
index 000000000000..9d796bfc07f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/darwin/variadic-02.c
@@ -0,0 +1,104 @@
+/* { dg-do compile } */
+
+/* we need this for __int128. */
+/* { dg-options "-std=gnu99 " } */
+/* We use the sections anchors to make the code easier to match. */
+/* { dg-additional-options " -O2 -fsection-anchors " } */
+/* { dg-final { check-function-bodies "**" "" "" { target *-*-darwin* } } } */
+
+#include <stdarg.h>
+
+/* What we care about here is that we load the values from incoming sp and
+   sp + 16.
+**foo:
+**	sub	sp, sp, #16
+**	...
+**	ldp	x[0-9]+, x[0-9]+, \[sp, 16\]
+**	...
+**	ldr	x[0-9]+, \[sp, 32\]
+**	ldr	x[0-9]+, \[sp, 40\]
+**	...
+*/
+
+__attribute__((__noinline__))
+__int128
+foo (int n, ...)
+{
+  __int128 a, b;
+  va_list ap;
+
+  va_start(ap, n);
+  a = va_arg(ap, __int128);
+  b = va_arg(ap, __int128);
+  va_end(ap);
+  return a + b;
+}
+
+/*
+**call_foo:
+**	...
+**	stp	x[0-9]+, x[0-9]+, \[sp\]
+**	mov	w0, 2
+**	stp	x[0-9]+, x[0-9]+, \[sp, 16\]
+**	bl	_foo
+**	...
+*/
+
+__attribute__((__noinline__))
+__int128
+call_foo (void)
+{
+  return foo (2, (__int128)1, (__int128)2);
+}
+
+
+/* sp = one int128, sp+16 = int, sp + 32 = other int128
+**bar:
+**	...
+**	sub	sp, sp, #16
+**	...
+**	ldp	x[0-9]+, x[0-9]+, \[sp, 16\]
+**	...
+**	ldr	x[0-9]+, \[sp, 48\]
+**	ldr	x[0-9]+, \[sp, 56\]
+**	...
+*/
+
+__attribute__((__noinline__))
+__int128
+bar (int n, ...)
+{ + __int128 a, b; + va_list ap; + + va_start(ap, n); + a = va_arg(ap, __int128); + n = va_arg(ap, int); + b = va_arg(ap, __int128); + va_end(ap); + return a + b; +} + +__attribute__((__noinline__)) +__int128 +baro (int n, ...); + +/* +**call_bar: +** ... +** mov w[0-9]+, 42 +** ... +** mov w0, 2 +** stp x[0-9]+, x[0-9]+, \[sp\] +** str w[0-9]+, \[sp, 16\] +** stp x[0-9]+, x[0-9]+, \[sp, 32\] +** bl _baro +** ... +*/ + +__attribute__((__noinline__)) +__int128 +call_bar (void) +{ + return baro (2, (__int128)1, 42, (__int128)2); +}