1 files changed, 0 insertions, 2198 deletions
diff --git a/ptx/src/ptx.lalrpop b/ptx/src/ptx.lalrpop
deleted file mode 100644
index e3a4022..0000000
--- a/ptx/src/ptx.lalrpop
+++ /dev/null
@@ -1,2198 +0,0 @@
-use crate::ast;
-use crate::ast::UnwrapWithVec;
-use crate::{without_none, vector_index};
-
-use lalrpop_util::ParseError;
-use std::convert::TryInto;
-
-grammar<'err>(errors: &'err mut Vec<ParseError<usize, Token<'input>, ast::PtxError>>);
-
-extern {
-    type Error = ast::PtxError;
-}
-
-match {
-    r"\s+" => { },
-    r"//[^\n\r]*[\n\r]*" => { },
-    r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/" => { },
-    r"0[fF][0-9a-zA-Z]{8}" => F32NumToken,
-    r"0[dD][0-9a-zA-Z]{16}" => F64NumToken,
-    r"0[xX][0-9a-zA-Z]+U?" => HexNumToken,
-    r"[0-9]+U?" => DecimalNumToken,
-    r#""[^"]*""# => String,
-    r"[0-9]+\.[0-9]+" => VersionNumber,
-    "!",
-    "(", ")",
-    "+",
-    "-",
-    ",",
-    ".",
-    ":",
-    ";",
-    "@",
-    "[", "]",
-    "{", "}",
-    "<", ">",
-    "|",
-    "=",
-    ".acq_rel",
-    ".acquire",
-    ".add",
-    ".address_size",
-    ".align",
-    ".aligned",
-    ".and",
-    ".approx",
-    ".b16",
-    ".b32",
-    ".b64",
-    ".b8",
-    ".ca",
-    ".cas",
-    ".cg",
-    ".const",
-    ".cs",
-    ".cta",
-    ".cv",
-    ".dec",
-    ".entry",
-    ".eq",
-    ".equ",
-    ".exch",
-    ".extern",
-    ".f16",
-    ".f16x2",
-    ".f32",
-    ".f64",
-    ".file",
-    ".ftz",
-    ".full",
-    ".func",
-    ".ge",
-    ".geu",
-    ".gl",
-    ".global",
-    ".gpu",
-    ".gt",
-    ".gtu",
-    ".hi",
-    ".hs",
-    ".inc",
-    ".le",
-    ".leu",
-    ".lo",
-    ".loc",
-    ".local",
-    ".ls",
-    ".lt",
-    ".ltu",
-    ".lu",
-    ".max",
-    ".maxnreg",
-    ".maxntid",
-    ".minnctapersm",
-    ".min",
-    ".nan",
-    ".NaN",
-    ".nc",
-    ".ne",
-    ".neu",
-    ".num",
-    ".or",
-    ".param",
-    ".pragma",
-    ".pred",
-    ".reg",
-    ".relaxed",
-    ".release",
-    ".reqntid",
-    ".rm",
-    ".rmi",
-    ".rn",
-    ".rni",
-    ".rp",
-    ".rpi",
-    ".rz",
-    ".rzi",
-    ".s16",
-    ".s32",
-    ".s64",
-    ".s8" ,
-    ".sat",
-    ".section",
-    ".shared",
-    ".sync",
-    ".sys",
-    ".target",
-    ".to",
-    ".u16",
-    ".u32",
-    ".u64",
-    ".u8" ,
-    ".uni",
-    ".v2",
-    ".v4",
-    ".version",
-    ".visible",
-    ".volatile",
-    ".wb",
-    ".weak",
-    ".wide",
-    ".wt",
-    ".xor",
-} else {
-    // IF YOU ARE ADDING A NEW TOKEN HERE ALSO ADD IT BELOW TO ExtendedID
-    "abs",
-    "activemask",
-    "add",
-    "and",
-    "atom",
-    "bar",
-    "barrier",
-    "bfe",
-    "bfi",
-    "bra",
-    "brev",
-    "call",
-    "clz",
-    "cos",
-    "cvt",
-    "cvta",
-    "debug",
-    "div",
-    "ex2",
-    "fma",
-    "ld",
-    "lg2",
-    "mad",
-    "map_f64_to_f32",
-    "max",
-    "membar",
-    "min",
-    "mov",
-    "mul",
-    "neg",
-    "not",
-    "or",
-    "popc",
-    "prmt",
-    "rcp",
-    "rem",
-    "ret",
-    "rsqrt",
-    "selp",
-    "setp",
-    "shl",
-    "shr",
-    "sin",
-    r"sm_[0-9]+" => ShaderModel,
-    "sqrt",
-    "st",
-    "sub",
-    "texmode_independent",
-    "texmode_unified",
-    "xor",
-} else {
-    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#identifiers
-    r"[a-zA-Z][a-zA-Z0-9_$]*|[_$%][a-zA-Z0-9_$]+" => ID,
-    r"\.[a-zA-Z][a-zA-Z0-9_$]*" => DotID,
-}
-
-ExtendedID : &'input str = {
-    "abs",
-    "activemask",
-    "add",
-    "and",
-    "atom",
-    "bar",
-    "barrier",
-    "bfe",
-    "bfi",
-    "bra",
-    "brev",
-    "call",
-    "clz",
-    "cos",
-    "cvt",
-    "cvta",
-    "debug",
-    "div",
-    "ex2",
-    "fma",
-    "ld",
-    "lg2",
-    "mad",
-    "map_f64_to_f32",
-    "max",
-    "membar",
-    "min",
-    "mov",
-    "mul",
-    "neg",
-    "not",
-    "or",
-    "popc",
-    "prmt",
-    "rcp",
-    "rem",
-    "ret",
-    "rsqrt",
-    "selp",
-    "setp",
-    "shl",
-    "shr",
-    "sin",
-    ShaderModel,
-    "sqrt",
-    "st",
-    "sub",
-    "texmode_independent",
-    "texmode_unified",
-    "xor",
-    ID
-}
-
-NumToken: (&'input str, u32, bool) = {
-    <s:HexNumToken> => {
-        if s.ends_with('U') {
-            (&s[2..s.len() - 1], 16, true)
-        } else {
-            (&s[2..], 16, false)
-        }
-    },
-    <s:DecimalNumToken> => {
-        let radix = if s.starts_with('0') { 8 } else { 10 };
-        if s.ends_with('U') {
-            (&s[..s.len() - 1], radix, true)
-        } else {
-            (s, radix, false)
-        }
-    }
-}
-
-F32Num: f32 = {
-    <s:F32NumToken> => {
-        match u32::from_str_radix(&s[2..], 16) {
-            Ok(x) => unsafe { std::mem::transmute::<_, f32>(x) },
-            Err(err) => {
-                errors.push(ParseError::User { error: ast::PtxError::from(err) });
-                0.0
-            }
-        }
-        
-    }
-}
-
-F64Num: f64 = {
-    <s:F64NumToken> => {
-        match u64::from_str_radix(&s[2..], 16) {
-            Ok(x) => unsafe { std::mem::transmute::<_, f64>(x) },
-            Err(err) => {
-                errors.push(ParseError::User { error: ast::PtxError::from(err) });
-                0.0
-            }
-        }
-    }
-}
-
-U8Num: u8 = {
-    <x:NumToken> => {
-        let (text, radix, _) = x;
-        match u8::from_str_radix(text, radix) {
-            Ok(x) => x,
-            Err(err) => {
-                errors.push(ParseError::User { error: ast::PtxError::from(err) });
-                0
-            }
-        }
-    }
-}
-
-U16Num: u16 = {
-    <x:NumToken> => {
-        let (text, radix, _) = x;
-        match u16::from_str_radix(text, radix) {
-            Ok(x) => x,
-            Err(err) => {
-                errors.push(ParseError::User { error: ast::PtxError::from(err) });
-                0
-            }
-        }
-    }
-}
-
-U32Num: u32 = {
-    <x:NumToken> => {
-        let (text, radix, _) = x;
-        match u32::from_str_radix(text, radix) {
-            Ok(x) => x,
-            Err(err) => {
-                errors.push(ParseError::User { error: ast::PtxError::from(err) });
-                0
-            }
-        }
-    }
-}
-
-// TODO: handle negative number properly
-S32Num: i32 = {
-    <sign:"-"?> <x:NumToken> => {
-        let (text, radix, _) = x;
-        match i32::from_str_radix(text, radix) {
-            Ok(x) => if sign.is_some() { -x } else { x },
-            Err(err) => {
-                errors.push(ParseError::User { error: ast::PtxError::from(err) });
-                0
-            }
-        }
-    }
-}
-
-pub Module: ast::Module<'input> = {
-    <v:Version> Target <d:Directive*> => {
-        ast::Module { version: v, directives: without_none(d) }
-    }
-};
-
-Version: (u8, u8) = {
-    ".version" <v:VersionNumber> => {
-        let dot = v.find('.').unwrap();
-        let major = v[..dot].parse::<u8>().unwrap_or_else(|err| {
-            errors.push(ParseError::User { error: ast::PtxError::from(err) });
-            0
-        });
-        let minor = v[dot+1..].parse::<u8>().unwrap_or_else(|err| {
-            errors.push(ParseError::User { error: ast::PtxError::from(err) });
-            0
-        });
-        (major,minor)
-    }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-target
-Target = {
-    ".target" Comma<TargetSpecifier>
-};
-
-TargetSpecifier = {
-    ShaderModel,
-    "texmode_unified",
-    "texmode_independent",
-    "debug",
-    "map_f64_to_f32"
-};
-
-Directive: Option<ast::Directive<'input, ast::ParsedArgParams<'input>>> = {
-    AddressSize => None,
-    <f:Function> => {
-        let (linking, func) = f;
-        Some(ast::Directive::Method(linking, func))
-    },
-    File => None,
-    Section => None,
-    <v:ModuleVariable> ";" => {
-        let (linking, var) = v;
-        Some(ast::Directive::Variable(linking, var))
-    },
-    @L ! @R => {
-        let (start, _, end)= (<>);
-        errors.push(ParseError::User { error: 
-            ast::PtxError::UnrecognizedDirective { start, end }
-        });
-        None
-    }
-};
-
-AddressSize = {
-    ".address_size" U8Num
-};
-
-Function: (ast::LinkingDirective, ast::Function<'input, &'input str, ast::Statement<ast::ParsedArgParams<'input>>>) = {
-    <linking:LinkingDirectives>
-    <func_directive:MethodDeclaration>
-    <tuning:TuningDirective*>
-    <body:FunctionBody> => {
-        (linking, ast::Function{func_directive, tuning, body})
-    }
-};
- 
-LinkingDirective: ast::LinkingDirective = {
-    ".extern" => ast::LinkingDirective::EXTERN,
-    ".visible" => ast::LinkingDirective::VISIBLE,
-    ".weak" => ast::LinkingDirective::WEAK,
-};
-
-TuningDirective: ast::TuningDirective = {
-    ".maxnreg" <ncta:U32Num> => ast::TuningDirective::MaxNReg(ncta),
-    ".maxntid" <nx:U32Num> => ast::TuningDirective::MaxNtid(nx, 1, 1),
-    ".maxntid" <nx:U32Num> "," <ny:U32Num> => ast::TuningDirective::MaxNtid(nx, ny, 1),
-    ".maxntid" <nx:U32Num> "," <ny:U32Num> "," <nz:U32Num> => ast::TuningDirective::MaxNtid(nx, ny, nz),
-    ".reqntid" <nx:U32Num> => ast::TuningDirective::ReqNtid(nx, 1, 1),
-    ".reqntid" <nx:U32Num> "," <ny:U32Num> => ast::TuningDirective::ReqNtid(nx, ny, 1),
-    ".reqntid" <nx:U32Num> "," <ny:U32Num> "," <nz:U32Num> => ast::TuningDirective::ReqNtid(nx, ny, nz),
-    ".minnctapersm" <ncta:U32Num> => ast::TuningDirective::MinNCtaPerSm(ncta),
-};
-
-LinkingDirectives: ast::LinkingDirective = {
-    <ldirs:LinkingDirective*> => {
-        ldirs.into_iter().fold(ast::LinkingDirective::NONE, |x, y| x | y)
-    }
-}
-
-MethodDeclaration: ast::MethodDeclaration<'input, &'input str> = {
-    ".entry" <name:ExtendedID> <input_arguments:KernelArguments> => {
-        let return_arguments = Vec::new();
-        let name = ast::MethodName::Kernel(name);
-        ast::MethodDeclaration{ return_arguments, name, input_arguments, shared_mem: None }
-    },
-    ".func" <return_arguments:FnArguments?> <name:ExtendedID> <input_arguments:FnArguments> => {
-        let return_arguments = return_arguments.unwrap_or_else(|| Vec::new());
-        let name = ast::MethodName::Func(name);
-        ast::MethodDeclaration{ return_arguments, name, input_arguments, shared_mem: None }
-    }
-};
-
-KernelArguments: Vec<ast::Variable<&'input str>> = {
-    "(" <args:Comma<KernelInput>> ")" => args
-};
-
-FnArguments: Vec<ast::Variable<&'input str>> = {
-    "(" <args:Comma<FnInput>> ")" => args
-};
-
-KernelInput: ast::Variable<&'input str> = {
-    <v:ParamDeclaration> => {
-        let (align, v_type, name) = v;
-        ast::Variable {
-            align,
-            v_type,
-            state_space: ast::StateSpace::Param,
-            name,
-            array_init: Vec::new()
-        }
-    }
-}
-
-FnInput: ast::Variable<&'input str> = {
-    <v:RegVariable> => {
-        let (align, v_type, name) = v;
-        let state_space = ast::StateSpace::Reg;
-        ast::Variable{ align, v_type, state_space, name, array_init: Vec::new() }
-    },
-    <v:ParamDeclaration> => {
-        let (align, v_type, name) = v;
-        let state_space = ast::StateSpace::Param;
-        ast::Variable{ align, v_type, state_space, name, array_init: Vec::new() }
-    }
-}
-
-FunctionBody: Option<Vec<ast::Statement<ast::ParsedArgParams<'input>>>> = {
-    "{" <s:Statement*> "}" => { Some(without_none(s)) },
-    ";" => { None }
-};
-
-StateSpaceSpecifier: ast::StateSpace = {
-    ".reg" => ast::StateSpace::Reg,
-    ".const" => ast::StateSpace::Const,
-    ".global" => ast::StateSpace::Global,
-    ".local" => ast::StateSpace::Local,
-    ".shared" => ast::StateSpace::Shared,
-    ".param" => ast::StateSpace::Param, // used to prepare function call
-};
-
-#[inline]
-ScalarType: ast::ScalarType = {
-    ".f16" => ast::ScalarType::F16,
-    ".f16x2" => ast::ScalarType::F16x2,
-    ".pred" => ast::ScalarType::Pred,
-    ".b8" => ast::ScalarType::B8,
-    ".b16" => ast::ScalarType::B16,
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-    ".u8" => ast::ScalarType::U8,
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s8" => ast::ScalarType::S8,
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-    ".f32" => ast::ScalarType::F32,
-    ".f64" => ast::ScalarType::F64,
-};
-
-Statement: Option<ast::Statement<ast::ParsedArgParams<'input>>> = {
-    <l:Label> => Some(ast::Statement::Label(l)),
-    DebugDirective => None,
-    <v:MultiVariable> ";" => Some(ast::Statement::Variable(v)),
-    <p:PredAt?> <i:Instruction> ";" => Some(ast::Statement::Instruction(p, i)),
-    PragmaStatement => None,
-    "{" <s:Statement*> "}" => Some(ast::Statement::Block(without_none(s))),
-    @L ! ";" @R => {
-        let (start, _, _, end) = (<>);
-        errors.push(ParseError::User { error: 
-            ast::PtxError::UnrecognizedStatement { start, end }
-        });
-        None
-    }
-};
-
-PragmaStatement: () = {
-    ".pragma" String  ";"
-}
-
-DebugDirective: () = {
-    DebugLocation
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-loc
-DebugLocation = {
-    ".loc" U32Num U32Num U32Num
-};
-
-Label: &'input str = {
-    <id:ExtendedID> ":" => id
-};
-
-Align: u32 = {
-    ".align" <x:U32Num> => x
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parameterized-variable-names
-MultiVariable: ast::MultiVariable<&'input str> = {
-    <var:Variable> <count:VariableParam?> => ast::MultiVariable{<>}
-}
-
-VariableParam: u32 = {
-    "<" <n:U32Num> ">" => n
-}
-
-Variable: ast::Variable<&'input str> = {
-    <v:RegVariable> => {
-        let (align, v_type, name) = v;
-        let state_space = ast::StateSpace::Reg;
-        ast::Variable {align, v_type, state_space, name, array_init: Vec::new()}
-    },
-    LocalVariable,
-    <v:ParamVariable> => {
-        let (align, array_init, v_type, name) = v;
-        let state_space = ast::StateSpace::Param;
-        ast::Variable {align, v_type, state_space, name, array_init}
-    },
-    SharedVariable,
-};
-
-RegVariable: (Option<u32>, ast::Type, &'input str) = {
-    ".reg" <var:VariableScalar<ScalarType>> => {
-        let (align, t, name) = var;
-        let v_type = ast::Type::Scalar(t);
-        (align, v_type, name)
-    },
-    ".reg" <var:VariableVector<SizedScalarType>> => {
-        let (align, v_len, t, name) = var;
-        let v_type = ast::Type::Vector(t, v_len);
-        (align, v_type, name)
-    }
-}
-
-LocalVariable: ast::Variable<&'input str> = {
-    ".local" <var:VariableScalar<SizedScalarType>> => {
-        let (align, t, name) = var;
-        let v_type = ast::Type::Scalar(t);
-        let state_space = ast::StateSpace::Local;
-        ast::Variable { align, v_type, state_space, name, array_init: Vec::new() }
-    },
-    ".local" <var:VariableVector<SizedScalarType>> => {
-        let (align, v_len, t, name) = var;
-        let v_type = ast::Type::Vector(t, v_len);
-        let state_space = ast::StateSpace::Local;
-        ast::Variable { align, v_type, state_space, name, array_init: Vec::new() }
-    },
-    ".local" <var:VariableArrayOrPointer<SizedScalarType>> => {
-        let (align, t, name, arr_or_ptr) = var;
-        let state_space = ast::StateSpace::Local;
-        let (v_type, array_init) = match arr_or_ptr {
-            ast::ArrayOrPointer::Array { dimensions, init } => {
-                (ast::Type::Array(t, dimensions), init)
-            }
-            ast::ArrayOrPointer::Pointer => {
-                errors.push(ParseError::User { error: ast::PtxError::ZeroDimensionArray });
-                (ast::Type::Array(t, Vec::new()), Vec::new())
-            }
-        };
-        ast::Variable { align, v_type, state_space, name, array_init }
-    }
-}
-
-SharedVariable: ast::Variable<&'input str> = {
-    ".shared" <var:VariableScalar<SizedScalarType>> => {
-        let (align, t, name) = var;
-        let state_space = ast::StateSpace::Shared;
-        let v_type = ast::Type::Scalar(t);
-        ast::Variable { align, v_type, state_space, name, array_init: Vec::new() }
-    },
-    ".shared" <var:VariableVector<SizedScalarType>> => {
-        let (align, v_len, t, name) = var;
-        let state_space = ast::StateSpace::Shared;
-        let v_type = ast::Type::Vector(t, v_len);
-        ast::Variable { align, v_type, state_space, name, array_init: Vec::new() }
-    },
-    ".shared" <var:VariableArrayOrPointer<SizedScalarType>> => {
-        let (align, t, name, arr_or_ptr) = var;
-        let state_space = ast::StateSpace::Shared;
-        let (v_type, array_init) = match arr_or_ptr {
-            ast::ArrayOrPointer::Array { dimensions, init } => {
-                (ast::Type::Array(t, dimensions), init)
-            }
-            ast::ArrayOrPointer::Pointer => {
-                errors.push(ParseError::User { error: ast::PtxError::ZeroDimensionArray });
-                (ast::Type::Array(t, Vec::new()), Vec::new())
-            }
-        };
-        ast::Variable { align, v_type, state_space, name, array_init }
-    }
-}
-
-ModuleVariable: (ast::LinkingDirective, ast::Variable<&'input str>) = {
-    <linking:LinkingDirectives> <state_space:VariableStateSpace> <def:GlobalVariableDefinitionNoArray> => {
-        let (align, v_type, name, array_init) = def;
-        (linking, ast::Variable { align, v_type, state_space, name, array_init })
-    },
-    <linking:LinkingDirectives> <space:VariableStateSpace> <var:VariableArrayOrPointer<SizedScalarType>> => {
-        let (align, t, name, arr_or_ptr) = var;
-        let (v_type, state_space, array_init) = match arr_or_ptr {
-            ast::ArrayOrPointer::Array { dimensions, init } => {
-                (ast::Type::Array(t, dimensions), space, init)
-            }
-            ast::ArrayOrPointer::Pointer => {
-                if !linking.contains(ast::LinkingDirective::EXTERN) {
-                    errors.push(ParseError::User { error: ast::PtxError::NonExternPointer });
-                }
-                (ast::Type::Array(t, Vec::new()), space, Vec::new())
-            }
-        };
-        (linking, ast::Variable{ align, v_type, state_space, name, array_init })
-    }
-}
-
-VariableStateSpace: ast::StateSpace = {
-    ".const" => ast::StateSpace::Const,
-    ".global" => ast::StateSpace::Global,
-    ".shared" => ast::StateSpace::Shared,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parameter-state-space
-ParamVariable: (Option<u32>, Vec<u8>, ast::Type, &'input str) = {
-    ".param" <var:VariableScalar<LdStScalarType>> => {
-        let (align, t, name) = var;
-        let v_type = ast::Type::Scalar(t);
-        (align, Vec::new(), v_type, name)
-    },
-    ".param" <var:VariableArrayOrPointer<SizedScalarType>> => {
-        let (align, t, name, arr_or_ptr) = var;
-        let (v_type, array_init) = match arr_or_ptr {
-            ast::ArrayOrPointer::Array { dimensions, init } => {
-                (ast::Type::Array(t, dimensions), init)
-            }
-            ast::ArrayOrPointer::Pointer => {
-                (ast::Type::Scalar(t), Vec::new())
-            }
-        };
-        (align, array_init, v_type, name)
-    }
-}
-
-ParamDeclaration: (Option<u32>, ast::Type, &'input str) = {
-    <var:ParamVariable> => {
-        let (align, array_init, v_type, name) = var;
-        if array_init.len() > 0 {
-            errors.push(ParseError::User { error: ast::PtxError::ArrayInitalizer });
-        }
-        (align, v_type, name)
-    }
-}
-
-GlobalVariableDefinitionNoArray: (Option<u32>, ast::Type, &'input str, Vec<u8>) = {
-    <scalar:VariableScalar<SizedScalarType>> => {
-        let (align, t, name) = scalar;
-        let v_type = ast::Type::Scalar(t);
-        (align, v_type, name, Vec::new())
-    },
-    <var:VariableVector<SizedScalarType>> => {
-        let (align, v_len, t, name) = var;
-        let v_type = ast::Type::Vector(t, v_len);
-        (align, v_type, name, Vec::new())
-    },
-}
-
-#[inline]
-SizedScalarType: ast::ScalarType = {
-    ".b8" => ast::ScalarType::B8,
-    ".b16" => ast::ScalarType::B16,
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-    ".u8" => ast::ScalarType::U8,
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s8" => ast::ScalarType::S8,
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-    ".f16" => ast::ScalarType::F16,
-    ".f16x2" => ast::ScalarType::F16x2,
-    ".f32" => ast::ScalarType::F32,
-    ".f64" => ast::ScalarType::F64,
-}
-
-#[inline]
-LdStScalarType: ast::ScalarType = {
-    ".b8" => ast::ScalarType::B8,
-    ".b16" => ast::ScalarType::B16,
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-    ".u8" => ast::ScalarType::U8,
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s8" => ast::ScalarType::S8,
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-    ".f16" => ast::ScalarType::F16,
-    ".f32" => ast::ScalarType::F32,
-    ".f64" => ast::ScalarType::F64,
-}
-
-Instruction: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    InstLd,
-    InstMov,
-    InstMul,
-    InstAdd,
-    InstSetp,
-    InstNot,
-    InstBra,
-    InstCvt,
-    InstShl,
-    InstShr,
-    InstSt,
-    InstRet,
-    InstCvta,
-    InstCall,
-    InstAbs,
-    InstMad,
-    InstFma,
-    InstOr,
-    InstAnd,
-    InstSub,
-    InstMin,
-    InstMax,
-    InstRcp,
-    InstSelp,
-    InstBar,
-    InstAtom,
-    InstAtomCas,
-    InstDiv,
-    InstSqrt,
-    InstRsqrt,
-    InstNeg,
-    InstSin,
-    InstCos,
-    InstLg2,
-    InstEx2,
-    InstClz,
-    InstBrev,
-    InstPopc,
-    InstXor,
-    InstRem,
-    InstBfe,
-    InstBfi,
-    InstPrmt,
-    InstActivemask,
-    InstMembar,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
-InstLd: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "ld" <q:LdStQualifier?> <ss:LdNonGlobalStateSpace?> <cop:LdCacheOperator?> <t:LdStType> <dst:DstOperandVec> "," <src:MemoryOperand> => {
-        ast::Instruction::Ld(
-            ast::LdDetails {
-                qualifier: q.unwrap_or(ast::LdStQualifier::Weak),
-                state_space: ss.unwrap_or(ast::StateSpace::Generic),
-                caching: cop.unwrap_or(ast::LdCacheOperator::Cached),
-                typ: t,
-                non_coherent: false
-            },
-            ast::Arg2Ld { dst:dst, src:src }
-        )
-    },
-    "ld" <q:LdStQualifier?> ".global" <cop:LdCacheOperator?> <t:LdStType> <dst:DstOperandVec> "," <src:MemoryOperand> => {
-        ast::Instruction::Ld(
-            ast::LdDetails {
-                qualifier: q.unwrap_or(ast::LdStQualifier::Weak),
-                state_space: ast::StateSpace::Global,
-                caching: cop.unwrap_or(ast::LdCacheOperator::Cached),
-                typ: t,
-                non_coherent: false
-            },
-            ast::Arg2Ld { dst:dst, src:src }
-        )
-    },
-    "ld" ".global" <cop:LdNcCacheOperator?> ".nc" <t:LdStType> <dst:DstOperandVec> "," <src:MemoryOperand> => {
-        ast::Instruction::Ld(
-            ast::LdDetails {
-                qualifier: ast::LdStQualifier::Weak,
-                state_space: ast::StateSpace::Global,
-                caching: cop.unwrap_or(ast::LdCacheOperator::Cached),
-                typ: t,
-                non_coherent: true
-            },
-            ast::Arg2Ld { dst:dst, src:src }
-        )
-    }
-};
-
-LdStType: ast::Type = {
-    <v:VectorPrefix> <t:LdStScalarType> => ast::Type::Vector(t, v),
-    <t:LdStScalarType> => ast::Type::Scalar(t),
-}
-
-LdStQualifier: ast::LdStQualifier = {
-    ".weak" => ast::LdStQualifier::Weak,
-    ".volatile" => ast::LdStQualifier::Volatile,
-    ".relaxed" <s:MemScope> => ast::LdStQualifier::Relaxed(s),
-    ".acquire" <s:MemScope> => ast::LdStQualifier::Acquire(s),
-};
-
-MemScope: ast::MemScope = {
-    ".cta" => ast::MemScope::Cta,
-    ".gpu" => ast::MemScope::Gpu,
-    ".sys" => ast::MemScope::Sys
-};
-
-MembarLevel: ast::MemScope = {
-    ".cta" => ast::MemScope::Cta,
-    ".gl" => ast::MemScope::Gpu,
-    ".sys" => ast::MemScope::Sys
-};
-
-LdNonGlobalStateSpace: ast::StateSpace = {
-    ".const" => ast::StateSpace::Const,
-    ".local" => ast::StateSpace::Local,
-    ".param" => ast::StateSpace::Param,
-    ".shared" => ast::StateSpace::Shared,
-};
-
-LdCacheOperator: ast::LdCacheOperator = {
-    ".ca" => ast::LdCacheOperator::Cached,
-    ".cg" => ast::LdCacheOperator::L2Only,
-    ".cs" => ast::LdCacheOperator::Streaming,
-    ".lu" => ast::LdCacheOperator::LastUse,
-    ".cv" => ast::LdCacheOperator::Uncached,
-};
-
-LdNcCacheOperator: ast::LdCacheOperator = {
-    ".ca" => ast::LdCacheOperator::Cached,
-    ".cg" => ast::LdCacheOperator::L2Only,
-    ".cs" => ast::LdCacheOperator::Streaming,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
-InstMov: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "mov" <pref:VectorPrefix?> <t:MovScalarType> <dst:DstOperandVec> "," <src:SrcOperandVec> => {
-        let mov_type = match pref {
-            Some(vec_width) => ast::Type::Vector(t, vec_width),
-            None => ast::Type::Scalar(t)
-        };
-        let details = ast::MovDetails::new(mov_type);
-        ast::Instruction::Mov(
-            details,
-            ast::Arg2Mov { dst, src }
-        )
-    }
-}
-
-#[inline]
-MovScalarType: ast::ScalarType = {
-    ".b16" => ast::ScalarType::B16,
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-    ".f32" => ast::ScalarType::F32,
-    ".f64" => ast::ScalarType::F64,
-    ".pred" => ast::ScalarType::Pred
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mul
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul
-InstMul: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "mul" <d:MulDetails> <a:Arg3> => ast::Instruction::Mul(d, a)
-};
-
-MulDetails: ast::MulDetails = {
-    <ctr:MulIntControl> <t:UIntType> => ast::MulDetails::Unsigned(ast::MulUInt{
-        typ: t,
-        control: ctr
-    }),
-    <ctr:MulIntControl> <t:SIntType> => ast::MulDetails::Signed(ast::MulSInt{
-        typ: t,
-        control: ctr
-    }),
-    <f:ArithFloat> => ast::MulDetails::Float(f)
-};
-
-MulIntControl: ast::MulIntControl = {
-    ".hi" => ast::MulIntControl::High,
-    ".lo" => ast::MulIntControl::Low,
-    ".wide" => ast::MulIntControl::Wide
-};
-
-#[inline]
-RoundingModeFloat : ast::RoundingMode = {
-    ".rn" => ast::RoundingMode::NearestEven,
-    ".rz" => ast::RoundingMode::Zero,
-    ".rm" => ast::RoundingMode::NegativeInf,
-    ".rp" => ast::RoundingMode::PositiveInf,
-};
-
-RoundingModeInt : ast::RoundingMode = {
-    ".rni" => ast::RoundingMode::NearestEven,
-    ".rzi" => ast::RoundingMode::Zero,
-    ".rmi" => ast::RoundingMode::NegativeInf,
-    ".rpi" => ast::RoundingMode::PositiveInf,
-};
-
-IntType : ast::ScalarType = {
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-};
-
-IntType3264: ast::ScalarType = {
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-}
-
-UIntType: ast::ScalarType = {
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-};
-
-SIntType: ast::ScalarType = {
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-};
-
-FloatType: ast::ScalarType = {
-    ".f16" => ast::ScalarType::F16,
-    ".f16x2" => ast::ScalarType::F16x2,
-    ".f32" => ast::ScalarType::F32,
-    ".f64" => ast::ScalarType::F64,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-add
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add
-InstAdd: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "add" <d:ArithDetails> <a:Arg3> => ast::Instruction::Add(d, a)
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp
-// TODO: support f16 setp
-InstSetp: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "setp" <d:SetpMode> <a:Arg4Setp> => ast::Instruction::Setp(d, a),
-    "setp" <d:SetpBoolMode> <a:Arg5Setp> => ast::Instruction::SetpBool(d, a),
-};
-
-SetpMode: ast::SetpData = {
-    <cmp_op:SetpCompareOp> <t:SetpTypeNoF32> => ast::SetpData {
-        typ: t,
-        flush_to_zero: None,
-        cmp_op: cmp_op,
-    },
-    <cmp_op:SetpCompareOp> <ftz:".ftz"?> ".f32" => ast::SetpData {
-        typ: ast::ScalarType::F32,
-        flush_to_zero: Some(ftz.is_some()),
-        cmp_op: cmp_op,
-    }
-
-};
-
-SetpBoolMode: ast::SetpBoolData = {
-    <cmp_op:SetpCompareOp> <bool_op:SetpBoolPostOp> <t:SetpTypeNoF32> => ast::SetpBoolData {
-        typ: t,
-        flush_to_zero: None,
-        cmp_op: cmp_op,
-        bool_op: bool_op,
-    },
-    <cmp_op:SetpCompareOp> <bool_op:SetpBoolPostOp> <ftz:".ftz"?> ".f32" => ast::SetpBoolData {
-        typ: ast::ScalarType::F32,
-        flush_to_zero: Some(ftz.is_some()),
-        cmp_op: cmp_op,
-        bool_op: bool_op,
-    }
-};
-
-SetpCompareOp: ast::SetpCompareOp = {
-    ".eq" => ast::SetpCompareOp::Eq,
-    ".ne" => ast::SetpCompareOp::NotEq,
-    ".lt" => ast::SetpCompareOp::Less,
-    ".le" => ast::SetpCompareOp::LessOrEq,
-    ".gt" => ast::SetpCompareOp::Greater,
-    ".ge" => ast::SetpCompareOp::GreaterOrEq,
-    ".lo" => ast::SetpCompareOp::Less,
-    ".ls" => ast::SetpCompareOp::LessOrEq,
-    ".hi" => ast::SetpCompareOp::Greater,
-    ".hs" => ast::SetpCompareOp::GreaterOrEq,
-    ".equ" => ast::SetpCompareOp::NanEq,
-    ".neu" => ast::SetpCompareOp::NanNotEq,
-    ".ltu" => ast::SetpCompareOp::NanLess,
-    ".leu" => ast::SetpCompareOp::NanLessOrEq,
-    ".gtu" => ast::SetpCompareOp::NanGreater,
-    ".geu" => ast::SetpCompareOp::NanGreaterOrEq,
-    ".num" => ast::SetpCompareOp::IsNotNan,
-    ".nan" => ast::SetpCompareOp::IsAnyNan,
-};
-
-SetpBoolPostOp: ast::SetpBoolPostOp = {
-    ".and" => ast::SetpBoolPostOp::And,
-    ".or" => ast::SetpBoolPostOp::Or,
-    ".xor" => ast::SetpBoolPostOp::Xor,
-};
-
-SetpTypeNoF32: ast::ScalarType = {
-    ".b16" => ast::ScalarType::B16,
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-    ".f64" => ast::ScalarType::F64,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not
-InstNot: ast::Instruction<ast::ParsedArgParams<'input>> = { // `not` + BooleanType + Arg2 operands.
-    "not" <t:BooleanType> <a:Arg2> => ast::Instruction::Not(t, a)
-};
-
-BooleanType: ast::ScalarType = { // Predicate and untyped-bit types; shared by the not/or/and/xor rules.
-    ".pred" => ast::ScalarType::Pred,
-    ".b16" => ast::ScalarType::B16,
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-at
-PredAt: ast::PredAt<&'input str> = { // `@id` yields not: false; `@!id` yields not: true.
-    "@" <label:ExtendedID> => ast::PredAt { not: false, label },
-    "@" "!" <label:ExtendedID> => ast::PredAt { not: true, label }
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra
-InstBra: ast::Instruction<ast::ParsedArgParams<'input>> = { // Presence of `.uni` is stored in BraData::uniform.
-    "bra" <u:".uni"?> <a:Arg1> => ast::Instruction::Bra(ast::BraData{ uniform: u.is_some() }, a)
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt
-InstCvt: ast::Instruction<ast::ParsedArgParams<'input>> = { // One alternative per (dst, src) category; modifier order is rounding, `.ftz`, `.sat`, dst type, src type.
-    "cvt" <s:".sat"?> <dst_t:CvtTypeInt> <src_t:CvtTypeInt> <a:Arg2> => { // int <- int
-        ast::Instruction::Cvt(ast::CvtDetails::new_int_from_int_checked(
-            s.is_some(),
-            dst_t,
-            src_t,
-            errors // the checked constructors receive the grammar's shared error list
-        ),
-        a)
-    },
-    "cvt" <r:RoundingModeFloat> <f:".ftz"?> <s:".sat"?> <dst_t:CvtTypeFloat> <src_t:CvtTypeInt> <a:Arg2> => { // float <- int
-        ast::Instruction::Cvt(ast::CvtDetails::new_float_from_int_checked(
-            r,
-            f.is_some(),
-            s.is_some(),
-            dst_t,
-            src_t,
-            errors
-        ),
-        a)
-    },
-    "cvt" <r:RoundingModeInt> <f:".ftz"?> <s:".sat"?> <dst_t:CvtTypeInt> <src_t:CvtTypeFloat> <a:Arg2> => { // int <- float
-        ast::Instruction::Cvt(ast::CvtDetails::new_int_from_float_checked(
-            r,
-            f.is_some(),
-            s.is_some(),
-            dst_t,
-            src_t,
-            errors
-        ),
-        a)
-    },
-    "cvt" <r:RoundingModeInt?> <s:".sat"?> ".f16" ".f16" <a:Arg2> => { // f16 <- f16
-        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
-            ast::CvtDesc {
-                rounding: r,
-                flush_to_zero: None, // no `.ftz` slot in this alternative
-                saturate: s.is_some(),
-                dst: ast::ScalarType::F16,
-                src: ast::ScalarType::F16
-            }
-        ), a)
-    },
-    "cvt" <f:".ftz"?> <s:".sat"?> ".f32" ".f16" <a:Arg2> => { // f32 <- f16
-        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
-            ast::CvtDesc {
-                rounding: None,
-                flush_to_zero: Some(f.is_some()),
-                saturate: s.is_some(),
-                dst: ast::ScalarType::F32,
-                src: ast::ScalarType::F16
-            }
-        ), a)
-    },
-    "cvt" <s:".sat"?> ".f64" ".f16" <a:Arg2> => { // f64 <- f16
-        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
-            ast::CvtDesc {
-                rounding: None,
-                flush_to_zero: None,
-                saturate: s.is_some(),
-                dst: ast::ScalarType::F64,
-                src: ast::ScalarType::F16
-            }
-        ), a)
-    },
-    "cvt" <r:RoundingModeFloat> <f:".ftz"?> <s:".sat"?> ".f16" ".f32" <a:Arg2> => { // f16 <- f32, rounding mandatory
-        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
-            ast::CvtDesc {
-                rounding: Some(r),
-                flush_to_zero: Some(f.is_some()),
-                saturate: s.is_some(),
-                dst: ast::ScalarType::F16,
-                src: ast::ScalarType::F32
-            }
-        ), a)
-    },
-    "cvt" <r:RoundingModeInt?> <f:".ftz"?> <s:".sat"?> ".f32" ".f32" <a:Arg2> => { // f32 <- f32
-        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
-            ast::CvtDesc {
-                rounding: r,
-                flush_to_zero: Some(f.is_some()),
-                saturate: s.is_some(),
-                dst: ast::ScalarType::F32,
-                src: ast::ScalarType::F32
-            }
-        ), a)
-    },
-    "cvt" <f:".ftz"?> <s:".sat"?> ".f64" ".f32" <a:Arg2> => { // f64 <- f32; `.ftz` precedes `.sat`, as in every other alternative
-        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
-            ast::CvtDesc {
-                rounding: None,
-                flush_to_zero: Some(f.is_some()),
-                saturate: s.is_some(),
-                dst: ast::ScalarType::F64,
-                src: ast::ScalarType::F32
-            }
-        ), a)
-    },
-    "cvt" <r:RoundingModeFloat> <s:".sat"?> ".f16" ".f64" <a:Arg2> => { // f16 <- f64, rounding mandatory
-        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
-            ast::CvtDesc {
-                rounding: Some(r),
-                flush_to_zero: None,
-                saturate: s.is_some(),
-                dst: ast::ScalarType::F16,
-                src: ast::ScalarType::F64
-            }
-        ), a)
-    },
-    "cvt" <r:RoundingModeFloat> <f:".ftz"?> <s:".sat"?> ".f32" ".f64" <a:Arg2> => { // f32 <- f64, rounding mandatory
-        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
-            ast::CvtDesc {
-                rounding: Some(r),
-                flush_to_zero: Some(f.is_some()), // must come from the `.ftz` binding, not from `.sat`
-                saturate: s.is_some(),
-                dst: ast::ScalarType::F32,
-                src: ast::ScalarType::F64
-            }
-        ), a)
-    },
-    "cvt" <r:RoundingModeInt?> <s:".sat"?> ".f64" ".f64" <a:Arg2> => { // f64 <- f64
-        ast::Instruction::Cvt(ast::CvtDetails::FloatFromFloat(
-            ast::CvtDesc {
-                rounding: r,
-                flush_to_zero: None,
-                saturate: s.is_some(),
-                dst: ast::ScalarType::F64,
-                src: ast::ScalarType::F64
-            }
-        ), a)
-    },
-};
-
-CvtTypeInt: ast::ScalarType = { // Integer type tokens usable as a cvt source or destination.
-    ".u8" => ast::ScalarType::U8,
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s8" => ast::ScalarType::S8,
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-};
-
-CvtTypeFloat: ast::ScalarType = { // Float type tokens for the mixed int/float cvt alternatives.
-    ".f16" => ast::ScalarType::F16,
-    ".f32" => ast::ScalarType::F32,
-    ".f64" => ast::ScalarType::F64,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl
-InstShl: ast::Instruction<ast::ParsedArgParams<'input>> = { // `shl` + ShlType + Arg3 operands.
-    "shl" <t:ShlType> <a:Arg3> => ast::Instruction::Shl(t, a)
-};
-
-ShlType: ast::ScalarType = { // Only the untyped-bit tokens are accepted for shl.
-    ".b16" => ast::ScalarType::B16,
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr
-InstShr: ast::Instruction<ast::ParsedArgParams<'input>> = { // `shr` + ShrType + Arg3 operands.
-    "shr" <t:ShrType> <a:Arg3> => ast::Instruction::Shr(t, a)
-};
-
-ShrType: ast::ScalarType = { // Unlike ShlType, shr also accepts unsigned and signed integer tokens.
-    ".b16" => ast::ScalarType::B16,
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st
-// Warning: NVIDIA documentation is incorrect, you can specify scope only once
-InstSt: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "st" <qualifier:LdStQualifier?> <space:StStateSpace?> <cache:StCacheOperator?> <typ:LdStType> <src1:MemoryOperand> "," <src2:SrcOperandVec> => {
-        ast::Instruction::St(
-            ast::StData { // omitted modifiers fall back to Weak / Generic / Writeback
-                qualifier: qualifier.unwrap_or(ast::LdStQualifier::Weak),
-                state_space: space.unwrap_or(ast::StateSpace::Generic),
-                caching: cache.unwrap_or(ast::StCacheOperator::Writeback),
-                typ
-            },
-            ast::Arg2St { src1, src2 }
-        )
-    }
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#using-addresses-arrays-and-vectors
-MemoryOperand: ast::Operand<&'input str> = { // An Operand written inside square brackets; the brackets are discarded.
-    "[" <o:Operand> "]" => o
-}
-
-StStateSpace: ast::StateSpace = { // State-space tokens accepted by the `st` rule.
-    ".global" => ast::StateSpace::Global,
-    ".local" => ast::StateSpace::Local,
-    ".param" => ast::StateSpace::Param,
-    ".shared" => ast::StateSpace::Shared,
-};
-
-StCacheOperator: ast::StCacheOperator = { // Cache-operator tokens accepted by the `st` rule.
-    ".wb" => ast::StCacheOperator::Writeback,
-    ".cg" => ast::StCacheOperator::L2Only,
-    ".cs" => ast::StCacheOperator::Streaming,
-    ".wt" => ast::StCacheOperator::Writethrough,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret
-InstRet: ast::Instruction<ast::ParsedArgParams<'input>> = { // Takes no operands; `.uni` presence is stored in RetData::uniform.
-    "ret" <u:".uni"?> => ast::Instruction::Ret(ast::RetData { uniform: u.is_some() })
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta
-InstCvta: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "cvta" <space:CvtaStateSpace> <size:CvtaSize> <args:Arg2> => { // without `.to`: from the named space to Generic
-        ast::Instruction::Cvta(ast::CvtaDetails {
-            to: ast::StateSpace::Generic,
-            from: space,
-            size
-        },
-        args)
-    },
-    "cvta" ".to" <space:CvtaStateSpace> <size:CvtaSize> <args:Arg2> => { // with `.to`: from Generic to the named space
-        ast::Instruction::Cvta(ast::CvtaDetails {
-            to: space,
-            from: ast::StateSpace::Generic,
-            size
-        },
-        args)
-    }
-}
-
-CvtaStateSpace: ast::StateSpace = { // Named state spaces that the cvta rule accepts.
-    ".const" => ast::StateSpace::Const,
-    ".global" => ast::StateSpace::Global,
-    ".local" => ast::StateSpace::Local,
-    ".shared" => ast::StateSpace::Shared,
-}
-
-CvtaSize: ast::CvtaSize = { // Size token of cvta: only `.u32` and `.u64` are accepted.
-    ".u32" => ast::CvtaSize::U32,
-    ".u64" => ast::CvtaSize::U64,
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call
-InstCall: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "call" <uni:".uni"?> <parsed:ArgCall> => { // ArgCall yields a (ret_params, func, param_list) tuple
-        let (ret_params, func, param_list) = parsed;
-        ast::Instruction::Call(ast::CallInst { uniform: uni.is_some(), ret_params, func, param_list })
-    }
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs
-InstAbs: ast::Instruction<ast::ParsedArgParams<'input>> = { // flush_to_zero is Some(..) only for the types whose alternative has a `.ftz` slot.
-    "abs" <t:SignedIntType> <a:Arg2> => { // signed integers: no `.ftz`
-        ast::Instruction::Abs(ast::AbsDetails { flush_to_zero: None, typ: t }, a)
-    },
-    "abs" <f:".ftz"?> ".f32" <a:Arg2> => {
-        ast::Instruction::Abs(ast::AbsDetails { flush_to_zero: Some(f.is_some()), typ: ast::ScalarType::F32 }, a)
-    },
-    "abs" ".f64" <a:Arg2> => { // f64: no `.ftz`
-        ast::Instruction::Abs(ast::AbsDetails { flush_to_zero: None, typ: ast::ScalarType::F64 }, a)
-    },
-    "abs" <f:".ftz"?> ".f16" <a:Arg2> => {
-        ast::Instruction::Abs(ast::AbsDetails { flush_to_zero: Some(f.is_some()), typ: ast::ScalarType::F16 }, a)
-    },
-    "abs" <f:".ftz"?> ".f16x2" <a:Arg2> => {
-        ast::Instruction::Abs(ast::AbsDetails { flush_to_zero: Some(f.is_some()), typ: ast::ScalarType::F16x2 }, a)
-    },
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mad
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad
-InstMad: ast::Instruction<ast::ParsedArgParams<'input>> = { // Reuses MulDetails for the modifiers and takes Arg4 operands.
-    "mad" <d:MulDetails> <a:Arg4> => ast::Instruction::Mad(d, a),
-    "mad" ".hi" ".sat" ".s32" => todo!(), // NOTE(review): placeholder with no operands; matching it panics inside the parser instead of reporting a PtxError
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma
-InstFma: ast::Instruction<ast::ParsedArgParams<'input>> = { // Uses ArithFloatMustRound, so a rounding modifier is required.
-    "fma" <f:ArithFloatMustRound> <a:Arg4> => ast::Instruction::Fma(f, a),
-};
-
-SignedIntType: ast::ScalarType = { // Signed integer tokens of 16, 32 and 64 bits (no `.s8`).
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or
-InstOr: ast::Instruction<ast::ParsedArgParams<'input>> = { // `or` + BooleanType + Arg3 operands.
-    "or" <d:BooleanType> <a:Arg3> => ast::Instruction::Or(d, a),
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and
-InstAnd: ast::Instruction<ast::ParsedArgParams<'input>> = { // `and` + BooleanType + Arg3 operands.
-    "and" <d:BooleanType> <a:Arg3> => ast::Instruction::And(d, a),
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp
-InstRcp: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "rcp" <rounding:RcpRoundingMode> <flush:".ftz"?> ".f32" <args:Arg2> => { // f32: `.approx` or a rounding mode, optional `.ftz`
-        ast::Instruction::Rcp(ast::RcpDetails {
-            rounding,
-            flush_to_zero: Some(flush.is_some()),
-            is_f64: false,
-        },
-        args)
-    },
-    "rcp" <mode:RoundingModeFloat> ".f64" <args:Arg2> => { // f64: rounding mode required, no `.ftz`
-        ast::Instruction::Rcp(ast::RcpDetails {
-            rounding: Some(mode),
-            flush_to_zero: None,
-            is_f64: true,
-        },
-        args)
-    }
-};
-
-RcpRoundingMode: Option<ast::RoundingMode> = { // `.approx` is encoded as None; an explicit rounding mode as Some(mode).
-    ".approx" => None,
-    <r:RoundingModeFloat> => Some(r)
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sub
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub
-InstSub: ast::Instruction<ast::ParsedArgParams<'input>> = { // `sub` + ArithDetails modifiers + Arg3 operands.
-    "sub" <d:ArithDetails> <a:Arg3> => ast::Instruction::Sub(d, a),
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min
-InstMin: ast::Instruction<ast::ParsedArgParams<'input>> = { // `min` + MinMaxDetails modifiers + Arg3 operands.
-    "min" <d:MinMaxDetails> <a:Arg3> => ast::Instruction::Min(d, a),
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max
-InstMax: ast::Instruction<ast::ParsedArgParams<'input>> = { // `max` + MinMaxDetails modifiers + Arg3 operands.
-    "max" <d:MinMaxDetails> <a:Arg3> => ast::Instruction::Max(d, a),
-};
-
-MinMaxDetails: ast::MinMaxDetails = { // Modifier/type part shared by the min and max rules.
-    <t:UIntType> => ast::MinMaxDetails::Unsigned(t),
-    <t:SIntType> => ast::MinMaxDetails::Signed(t),
-    <ftz:".ftz"?> <nan:".NaN"?> ".f32" => ast::MinMaxDetails::Float( // `.ftz` and `.NaN` are both optional, in that order
-        ast::MinMaxFloat{ flush_to_zero: Some(ftz.is_some()), nan: nan.is_some(), typ: ast::ScalarType::F32 }
-    ),
-    ".f64" => ast::MinMaxDetails::Float( // f64 accepts neither `.ftz` nor `.NaN`
-        ast::MinMaxFloat{ flush_to_zero: None, nan: false, typ: ast::ScalarType::F64 }
-    ),
-    <ftz:".ftz"?> <nan:".NaN"?> ".f16" => ast::MinMaxDetails::Float(
-        ast::MinMaxFloat{ flush_to_zero: Some(ftz.is_some()), nan: nan.is_some(), typ: ast::ScalarType::F16 }
-    ),
-    <ftz:".ftz"?> <nan:".NaN"?> ".f16x2" => ast::MinMaxDetails::Float(
-        ast::MinMaxFloat{ flush_to_zero: Some(ftz.is_some()), nan: nan.is_some(), typ: ast::ScalarType::F16x2 }
-    )
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
-InstSelp: ast::Instruction<ast::ParsedArgParams<'input>> = { // `selp` + SelpType + Arg4 operands.
-    "selp" <t:SelpType> <a:Arg4> => ast::Instruction::Selp(t, a),
-};
-
-SelpType: ast::ScalarType = { // Type tokens accepted by the selp rule.
-    ".b16" => ast::ScalarType::B16,
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-    ".u16" => ast::ScalarType::U16,
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-    ".f32" => ast::ScalarType::F32,
-    ".f64" => ast::ScalarType::F64,
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar
-InstBar: ast::Instruction<ast::ParsedArgParams<'input>> = { // All three spellings produce the same BarDetails::SyncAligned.
-    "bar" ".sync" <a:Arg1Bar> => ast::Instruction::Bar(ast::BarDetails::SyncAligned, a),
-    "barrier" ".sync" <a:Arg1Bar> => ast::Instruction::Bar(ast::BarDetails::SyncAligned, a),
-    "barrier" ".sync" ".aligned" <a:Arg1Bar> => ast::Instruction::Bar(ast::BarDetails::SyncAligned, a),
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
-// The documentation does not mention all supported operations:
-// * Operation .add requires .u32 or .s32 or .u64 or .f64 or .f16 or .f16x2 or .f32
-// * Operation .inc requires .u32 type for instruction
-// * Operation .dec requires .u32 type for instruction
-// Otherwise as documented
-InstAtom: ast::Instruction<ast::ParsedArgParams<'input>> = { // Every alternative defaults omitted modifiers to Relaxed / Gpu / Generic.
-    "atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> <op:AtomBitOp> <typ:BitType> <a:Arg3Atom> => { // bitwise ops and exch on .b32/.b64
-        let details = ast::AtomDetails {
-            semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
-            scope: scope.unwrap_or(ast::MemScope::Gpu),
-            space: space.unwrap_or(ast::StateSpace::Generic),
-            inner: ast::AtomInnerDetails::Bit { op, typ }
-        };
-        ast::Instruction::Atom(details,a)
-    },
-    "atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".inc" ".u32" <a:Arg3Atom> => { // `.inc` is accepted with `.u32` only
-        let details = ast::AtomDetails {
-            semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
-            scope: scope.unwrap_or(ast::MemScope::Gpu),
-            space: space.unwrap_or(ast::StateSpace::Generic),
-            inner: ast::AtomInnerDetails::Unsigned {
-                op: ast::AtomUIntOp::Inc,
-                typ: ast::ScalarType::U32
-            }
-        };
-        ast::Instruction::Atom(details,a)
-    },
-    "atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".dec" ".u32" <a:Arg3Atom> => { // `.dec` is accepted with `.u32` only
-        let details = ast::AtomDetails {
-            semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
-            scope: scope.unwrap_or(ast::MemScope::Gpu),
-            space: space.unwrap_or(ast::StateSpace::Generic),
-            inner: ast::AtomInnerDetails::Unsigned {
-                op: ast::AtomUIntOp::Dec,
-                typ: ast::ScalarType::U32
-            }
-        };
-        ast::Instruction::Atom(details,a)
-    },
-    "atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".add" <typ:FloatType> <a:Arg3Atom> => { // floating-point add
-        let op = ast::AtomFloatOp::Add;
-        let details = ast::AtomDetails {
-            semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
-            scope: scope.unwrap_or(ast::MemScope::Gpu),
-            space: space.unwrap_or(ast::StateSpace::Generic),
-            inner: ast::AtomInnerDetails::Float { op, typ }
-        };
-        ast::Instruction::Atom(details,a)
-    },
-    "atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> <op: AtomUIntOp> <typ:UIntType3264> <a:Arg3Atom> => { // add/min/max on .u32/.u64
-        let details = ast::AtomDetails {
-            semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
-            scope: scope.unwrap_or(ast::MemScope::Gpu),
-            space: space.unwrap_or(ast::StateSpace::Generic),
-            inner: ast::AtomInnerDetails::Unsigned { op, typ }
-        };
-        ast::Instruction::Atom(details,a)
-    },
-    "atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> <op: AtomSIntOp> <typ:SIntType3264> <a:Arg3Atom> => { // add/min/max on .s32/.s64
-        let details = ast::AtomDetails {
-            semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
-            scope: scope.unwrap_or(ast::MemScope::Gpu),
-            space: space.unwrap_or(ast::StateSpace::Generic),
-            inner: ast::AtomInnerDetails::Signed { op, typ }
-        };
-        ast::Instruction::Atom(details,a)
-    }
-}
-
-InstAtomCas: ast::Instruction<ast::ParsedArgParams<'input>> = { // Kept apart from InstAtom: takes Arg4Atom and builds AtomCasDetails.
-    "atom" <sem:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".cas" <typ:BitType> <args:Arg4Atom> => {
-        ast::Instruction::AtomCas(ast::AtomCasDetails {
-            semantics: sem.unwrap_or(ast::AtomSemantics::Relaxed),
-            scope: scope.unwrap_or(ast::MemScope::Gpu),
-            space: space.unwrap_or(ast::StateSpace::Generic),
-            typ,
-        },
-        args)
-    },
-}
-
-AtomSemantics: ast::AtomSemantics = { // Memory-ordering modifier tokens for the atom rules.
-    ".relaxed" => ast::AtomSemantics::Relaxed,
-    ".acquire" => ast::AtomSemantics::Acquire,
-    ".release" => ast::AtomSemantics::Release,
-    ".acq_rel" => ast::AtomSemantics::AcquireRelease
-}
-
-AtomSpace: ast::StateSpace = { // Explicit state spaces for atom: only `.global` and `.shared`.
-    ".global" => ast::StateSpace::Global,
-    ".shared" => ast::StateSpace::Shared
-}
-
-AtomBitOp: ast::AtomBitOp = { // Operations paired with BitType in InstAtom.
-    ".and" => ast::AtomBitOp::And,
-    ".or" => ast::AtomBitOp::Or,
-    ".xor" => ast::AtomBitOp::Xor,
-    ".exch" => ast::AtomBitOp::Exchange,
-}
-
-AtomUIntOp: ast::AtomUIntOp = { // Inc/Dec are not listed here; InstAtom matches `.inc`/`.dec` directly.
-    ".add" => ast::AtomUIntOp::Add,
-    ".min" => ast::AtomUIntOp::Min,
-    ".max" => ast::AtomUIntOp::Max,
-}
-
-AtomSIntOp: ast::AtomSIntOp = { // Same tokens as AtomUIntOp, mapped to the signed enum.
-    ".add" => ast::AtomSIntOp::Add,
-    ".min" => ast::AtomSIntOp::Min,
-    ".max" => ast::AtomSIntOp::Max,
-}
-
-BitType: ast::ScalarType = { // 32- and 64-bit untyped tokens; used by atom, clz, brev, popc and bfi.
-    ".b32" => ast::ScalarType::B32,
-    ".b64" => ast::ScalarType::B64,
-}
-
-UIntType3264: ast::ScalarType = { // Unsigned integer tokens restricted to 32 and 64 bits.
-    ".u32" => ast::ScalarType::U32,
-    ".u64" => ast::ScalarType::U64,
-}
-
-SIntType3264: ast::ScalarType = { // Signed integer tokens restricted to 32 and 64 bits.
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div
-InstDiv: ast::Instruction<ast::ParsedArgParams<'input>> = { // Integer and floating-point forms of `div`, all with Arg3 operands.
-    "div" <t:UIntType> <a:Arg3> => ast::Instruction::Div(ast::DivDetails::Unsigned(t), a),
-    "div" <t:SIntType> <a:Arg3> => ast::Instruction::Div(ast::DivDetails::Signed(t), a),
-    "div" <kind:DivFloatKind> <ftz:".ftz"?> ".f32" <a:Arg3> => { // f32: a DivFloatKind is mandatory, `.ftz` optional
-        let inner = ast::DivFloatDetails {
-            typ: ast::ScalarType::F32,
-            flush_to_zero: Some(ftz.is_some()),
-            kind
-        };
-        ast::Instruction::Div(ast::DivDetails::Float(inner), a)
-    },
-    "div" <rnd:RoundingModeFloat> ".f64" <a:Arg3> => { // f64: only an explicit rounding mode, no `.ftz`
-        let inner = ast::DivFloatDetails {
-            typ: ast::ScalarType::F64,
-            flush_to_zero: None,
-            kind: ast::DivFloatKind::Rounding(rnd)
-        };
-        ast::Instruction::Div(ast::DivDetails::Float(inner), a)
-    },
-}
-
-DivFloatKind: ast::DivFloatKind = { // `.approx`, `.full`, or an explicit rounding mode.
-    ".approx" => ast::DivFloatKind::Approx,
-    ".full" => ast::DivFloatKind::Full,
-    <rnd:RoundingModeFloat> => ast::DivFloatKind::Rounding(rnd),
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt
-InstSqrt: ast::Instruction<ast::ParsedArgParams<'input>> = { // `.approx` exists for f32 only; f64 requires a rounding mode.
-    "sqrt" ".approx" <ftz:".ftz"?> ".f32" <a:Arg2> => {
-        let details = ast::SqrtDetails {
-            typ: ast::ScalarType::F32,
-            flush_to_zero: Some(ftz.is_some()),
-            kind: ast::SqrtKind::Approx,
-        };
-        ast::Instruction::Sqrt(details, a)
-    },
-    "sqrt" <rnd:RoundingModeFloat> <ftz:".ftz"?> ".f32" <a:Arg2> => {
-        let details = ast::SqrtDetails {
-            typ: ast::ScalarType::F32,
-            flush_to_zero: Some(ftz.is_some()),
-            kind: ast::SqrtKind::Rounding(rnd),
-        };
-        ast::Instruction::Sqrt(details, a)
-    },
-    "sqrt" <rnd:RoundingModeFloat> ".f64" <a:Arg2> => { // no `.ftz` slot for f64
-        let details = ast::SqrtDetails {
-            typ: ast::ScalarType::F64,
-            flush_to_zero: None,
-            kind: ast::SqrtKind::Rounding(rnd),
-        };
-        ast::Instruction::Sqrt(details, a)
-    }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64
-InstRsqrt: ast::Instruction<ast::ParsedArgParams<'input>> = { // `.approx` is mandatory; flush_to_zero here is a plain bool, not an Option.
-    "rsqrt" ".approx" <ftz:".ftz"?> ".f32" <a:Arg2> => {
-        let details = ast::RsqrtDetails {
-            typ: ast::ScalarType::F32,
-            flush_to_zero: ftz.is_some(),
-        };
-        ast::Instruction::Rsqrt(details, a)
-    },
-    "rsqrt" ".approx" <ftz:".ftz"?> ".f64" <a:Arg2> => { // unlike sqrt, the f64 form also accepts `.ftz`
-        let details = ast::RsqrtDetails {
-            typ: ast::ScalarType::F64,
-            flush_to_zero: ftz.is_some(),
-        };
-        ast::Instruction::Rsqrt(details, a)
-    },
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg
-InstNeg: ast::Instruction<ast::ParsedArgParams<'input>> = {
-    "neg" <flush:".ftz"?> <typ:NegTypeFtz> <args:Arg2> => { // types that admit `.ftz`: flush_to_zero is always Some(..)
-        ast::Instruction::Neg(ast::NegDetails {
-            typ,
-            flush_to_zero: Some(flush.is_some()),
-        },
-        args)
-    },
-    "neg" <typ:NegTypeNonFtz> <args:Arg2> => { // types without a `.ftz` slot: flush_to_zero is None
-        ast::Instruction::Neg(ast::NegDetails {
-            typ,
-            flush_to_zero: None,
-        },
-        args)
-    },
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin
-InstSin: ast::Instruction<ast::ParsedArgParams<'input>> = { // Only the `sin.approx{.ftz}.f32` form is accepted.
-    "sin" ".approx" <ftz:".ftz"?> ".f32" <arg:Arg2> => {
-        ast::Instruction::Sin{ flush_to_zero: ftz.is_some(), arg }
-    },
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos
-InstCos: ast::Instruction<ast::ParsedArgParams<'input>> = { // Only the `cos.approx{.ftz}.f32` form is accepted.
-    "cos" ".approx" <ftz:".ftz"?> ".f32" <arg:Arg2> => {
-        ast::Instruction::Cos{ flush_to_zero: ftz.is_some(), arg }
-    },
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2
-InstLg2: ast::Instruction<ast::ParsedArgParams<'input>> = { // Only the `lg2.approx{.ftz}.f32` form is accepted.
-    "lg2" ".approx" <ftz:".ftz"?> ".f32" <arg:Arg2> => {
-        ast::Instruction::Lg2{ flush_to_zero: ftz.is_some(), arg }
-    },
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2
-InstEx2: ast::Instruction<ast::ParsedArgParams<'input>> = { // Only the `ex2.approx{.ftz}.f32` form is accepted.
-    "ex2" ".approx" <ftz:".ftz"?> ".f32" <arg:Arg2> => {
-        ast::Instruction::Ex2{ flush_to_zero: ftz.is_some(), arg }
-    },
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz
-InstClz: ast::Instruction<ast::ParsedArgParams<'input>> = { // `<>` passes the named bindings (typ, arg) as the struct fields.
-    "clz" <typ:BitType> <arg:Arg2> => ast::Instruction::Clz{ <> }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev
-InstBrev: ast::Instruction<ast::ParsedArgParams<'input>> = { // `<>` passes the named bindings (typ, arg) as the struct fields.
-    "brev" <typ:BitType> <arg:Arg2> => ast::Instruction::Brev{ <> }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc
-InstPopc: ast::Instruction<ast::ParsedArgParams<'input>> = { // `<>` passes the named bindings (typ, arg) as the struct fields.
-    "popc" <typ:BitType> <arg:Arg2> => ast::Instruction::Popc{ <> }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor
-InstXor: ast::Instruction<ast::ParsedArgParams<'input>> = { // Same shape as the or/and rules, but built as a struct variant via `<>`.
-    "xor" <typ:BooleanType> <arg:Arg3> => ast::Instruction::Xor{ <> }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe
-InstBfe: ast::Instruction<ast::ParsedArgParams<'input>> = { // `bfe` + IntType3264 + Arg4 operands.
-    "bfe" <typ:IntType3264> <arg:Arg4> => ast::Instruction::Bfe{ <> }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi
-InstBfi: ast::Instruction<ast::ParsedArgParams<'input>> = { // `bfi` + BitType + Arg5 operands.
-    "bfi" <typ:BitType> <arg:Arg5> => ast::Instruction::Bfi{ <> }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
-InstPrmt: ast::Instruction<ast::ParsedArgParams<'input>> = { // Only `.b32` is accepted; the value after the final comma is parsed as U16Num into `control`.
-    "prmt" ".b32" <arg:Arg3> "," <control:U16Num> => ast::Instruction::Prmt{ <> }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem
-InstRem: ast::Instruction<ast::ParsedArgParams<'input>> = { // `rem` + IntType + Arg3 operands.
-    "rem" <typ:IntType> <arg:Arg3> => ast::Instruction::Rem{ <> }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
-InstActivemask: ast::Instruction<ast::ParsedArgParams<'input>> = { // Only `.b32` is accepted; takes Arg1 operands.
-    "activemask" ".b32" <arg:Arg1> => ast::Instruction::Activemask{ <> }
-}
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
-InstMembar: ast::Instruction<ast::ParsedArgParams<'input>> = { // Takes a MembarLevel modifier and no operands.
-    "membar" <level:MembarLevel> => ast::Instruction::Membar{ <> }
-}
-
-NegTypeFtz: ast::ScalarType = { // Types for which the neg rule allows an optional `.ftz`.
-    ".f16" => ast::ScalarType::F16,
-    ".f16x2" => ast::ScalarType::F16x2,
-    ".f32" => ast::ScalarType::F32,
-}
-
-NegTypeNonFtz: ast::ScalarType = { // Types for which the neg rule has no `.ftz` slot.
-    ".s16" => ast::ScalarType::S16,
-    ".s32" => ast::ScalarType::S32,
-    ".s64" => ast::ScalarType::S64,
-    ".f64" => ast::ScalarType::F64
-}
-
-ArithDetails: ast::ArithDetails = { // Modifier/type part of arithmetic instructions (used by `sub` in this section).
-    <t:UIntType> => ast::ArithDetails::Unsigned(t),
-    <t:SIntType> => ast::ArithDetails::Signed(ast::ArithSInt {
-        typ: t,
-        saturate: false,
-    }),
-    ".sat" ".s32" => ast::ArithDetails::Signed(ast::ArithSInt { // `.sat` is accepted for `.s32` only
-        typ: ast::ScalarType::S32,
-        saturate: true,
-    }),
-    <f:ArithFloat> => ast::ArithDetails::Float(f)
-}
-
-ArithFloat: ast::ArithFloat = { // Floating-point modifiers with an optional rounding mode.
-    <rn:RoundingModeFloat?> <ftz:".ftz"?> <sat:".sat"?> ".f32" => ast::ArithFloat {
-        typ: ast::ScalarType::F32,
-        rounding: rn,
-        flush_to_zero: Some(ftz.is_some()),
-        saturate: sat.is_some(),
-    },
-    <rn:RoundingModeFloat?> ".f64" => ast::ArithFloat { // f64 accepts neither `.ftz` nor `.sat`
-        typ: ast::ScalarType::F64,
-        rounding: rn,
-        flush_to_zero: None,
-        saturate: false,
-    },
-    <rn:".rn"?> <ftz:".ftz"?> <sat:".sat"?> ".f16" => ast::ArithFloat { // half precision: `.rn` is the only rounding token accepted
-        typ: ast::ScalarType::F16,
-        rounding: rn.map(|_| ast::RoundingMode::NearestEven),
-        flush_to_zero: Some(ftz.is_some()),
-        saturate: sat.is_some(),
-    },
-    <rn:".rn"?> <ftz:".ftz"?> <sat:".sat"?> ".f16x2" => ast::ArithFloat {
-        typ: ast::ScalarType::F16x2,
-        rounding: rn.map(|_| ast::RoundingMode::NearestEven),
-        flush_to_zero: Some(ftz.is_some()),
-        saturate: sat.is_some(),
-    },
-}
-
-ArithFloatMustRound: ast::ArithFloat = { // Same shape as ArithFloat, but the rounding modifier is mandatory (used by `fma`).
-    <rn:RoundingModeFloat> <ftz:".ftz"?> <sat:".sat"?> ".f32" => ast::ArithFloat {
-        typ: ast::ScalarType::F32,
-        rounding: Some(rn),
-        flush_to_zero: Some(ftz.is_some()),
-        saturate: sat.is_some(),
-    },
-    <rn:RoundingModeFloat> ".f64" => ast::ArithFloat { // f64 accepts neither `.ftz` nor `.sat`
-        typ: ast::ScalarType::F64,
-        rounding: Some(rn),
-        flush_to_zero: None,
-        saturate: false,
-    },
-    ".rn" <ftz:".ftz"?> <sat:".sat"?> ".f16" => ast::ArithFloat { // half precision: the literal `.rn` is required
-        typ: ast::ScalarType::F16,
-        rounding: Some(ast::RoundingMode::NearestEven),
-        flush_to_zero: Some(ftz.is_some()),
-        saturate: sat.is_some(),
-    },
-    ".rn" <ftz:".ftz"?> <sat:".sat"?> ".f16x2" => ast::ArithFloat {
-        typ: ast::ScalarType::F16x2,
-        rounding: Some(ast::RoundingMode::NearestEven),
-        flush_to_zero: Some(ftz.is_some()),
-        saturate: sat.is_some(),
-    },
-}
-
-// Generic instruction operand: a register name, a register plus a signed
-// 32-bit offset (`reg+imm`), or an immediate value.
-Operand: ast::Operand<&'input str> = {
-    <reg:ExtendedID> => ast::Operand::Reg(reg),
-    <base:ExtendedID> "+" <delta:S32Num> => ast::Operand::RegOffset(base, delta),
-    <imm:ImmediateValue> => ast::Operand::Imm(imm)
-};
-
-// Operand accepted in the parameter list of `ArgCall`: a plain name or an
-// immediate value (no `reg+offset` form).
-CallOperand: ast::Operand<&'input str> = {
-    <name:ExtendedID> => ast::Operand::Reg(name),
-    <imm:ImmediateValue> => ast::Operand::Imm(imm)
-};
-
-// TODO: start parsing whole constants sub-language:
-//       https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#constants
-//
-// Immediate operand: an optionally negated integer literal, or a hex-encoded
-// `f32`/`f64` literal. Integer parse failures are appended to `errors` and a
-// zero placeholder is returned so that parsing can continue.
-ImmediateValue: ast::ImmediateValue = {
-    // TODO: `-<n>U` with n <= i64::MAX still produces S64 rather than a wrapped U64
-    <neg:"-"?> <x:NumToken> => {
-        let (num, radix, is_unsigned) = x;
-        if neg.is_some() {
-            match i64::from_str_radix(num, radix) {
-                // `x` was parsed from an unsigned digit string, so `-x` cannot overflow.
-                Ok(x) => ast::ImmediateValue::S64(-x),
-                // The magnitude does not fit in a signed 64-bit value (for example
-                // `-9223372036854775808`). PTX types such a literal as .u64 and unary
-                // minus on .u64 wraps modulo 2^64, so negate it as an unsigned value
-                // instead of rejecting it.
-                Err(_) => match u64::from_str_radix(num, radix) {
-                    Ok(x) => ast::ImmediateValue::U64(x.wrapping_neg()),
-                    Err(err) => {
-                        errors.push(ParseError::User { error: ast::PtxError::from(err) });
-                        ast::ImmediateValue::S64(0)
-                    }
-                },
-            }
-        } else if is_unsigned {
-            match u64::from_str_radix(num, radix) {
-                Ok(x) => ast::ImmediateValue::U64(x),
-                Err(err) => {
-                    errors.push(ParseError::User { error: ast::PtxError::from(err) });
-                    ast::ImmediateValue::U64(0)
-                }
-            }
-        } else {
-            // No suffix: prefer a signed value, fall back to unsigned when it does not fit.
-            match i64::from_str_radix(num, radix) {
-                Ok(x) => ast::ImmediateValue::S64(x),
-                Err(_) => {
-                    match u64::from_str_radix(num, radix) {
-                        Ok(x) => ast::ImmediateValue::U64(x),
-                        Err(err) => {
-                            errors.push(ParseError::User { error: ast::PtxError::from(err) });
-                            ast::ImmediateValue::U64(0)
-                        }
-                    }
-                }
-            }
-        }
-    },
-    <f:F32Num> => {
-        ast::ImmediateValue::F32(f)
-    },
-    <f:F64Num> => {
-        ast::ImmediateValue::F64(f)
-    }
-}
-
-// Argument list consisting of a single identifier, stored in the `src` field.
-Arg1: ast::Arg1<ast::ParsedArgParams<'input>> = {
-    <src:ExtendedID> => ast::Arg1 { src }
-};
-
-// Argument list consisting of a single `Operand`, stored in the `src` field.
-// NOTE(review): presumably used by the barrier instruction - confirm at the use site.
-Arg1Bar: ast::Arg1Bar<ast::ParsedArgParams<'input>> = {
-    <src:Operand> => ast::Arg1Bar { src }
-};
-
-// Two-operand argument list: `dst, src`.
-Arg2: ast::Arg2<ast::ParsedArgParams<'input>> = {
-    <dst:DstOperand> "," <src:Operand> => ast::Arg2 { dst, src }
-};
-
-// Vector member access such as `name.x`: yields the base name and the member
-// index computed by `vector_index`. Two token shapes are accepted: a separate
-// "." token followed by an identifier, or a single `DotID` token whose leading
-// character is skipped before lookup. When `vector_index` fails, its error is
-// recorded in `errors` and index 0 is used as a placeholder.
-MemberOperand: (&'input str, u8) = {
-    <base:ExtendedID> "." <member:ExtendedID> => {
-        let index = vector_index(member).unwrap_or_else(|err| {
-            errors.push(err);
-            0
-        });
-        (base, index)
-    },
-    <base:ExtendedID> <member:DotID> => {
-        let index = vector_index(&member[1..]).unwrap_or_else(|err| {
-            errors.push(err);
-            0
-        });
-        (base, index)
-    }
-};
-
-// Brace-enclosed list of exactly two or exactly four names, e.g. `{a, b}`
-// or `{a, b, c, d}`, returned in source order.
-VectorExtract: Vec<&'input str> = {
-    "{" <first:ExtendedID> "," <second:ExtendedID> "}" => vec![first, second],
-    "{" <first:ExtendedID> "," <second:ExtendedID> "," <third:ExtendedID> "," <fourth:ExtendedID> "}" => {
-        vec![first, second, third, fourth]
-    },
-};
-
-// Three-operand argument list: `dst, src1, src2`.
-Arg3: ast::Arg3<ast::ParsedArgParams<'input>> = {
-    <dst:DstOperand> "," <src1:Operand> "," <src2:Operand> => ast::Arg3 { dst, src1, src2 }
-};
-
-// Three-operand argument list whose first source is written in square
-// brackets: `dst, [src1], src2`. Produces the same `ast::Arg3` as `Arg3`.
-Arg3Atom: ast::Arg3<ast::ParsedArgParams<'input>> = {
-    <dst:DstOperand> "," "[" <src1:Operand> "]" "," <src2:Operand> => ast::Arg3 { dst, src1, src2 }
-};
-
-// Four-operand argument list: `dst, src1, src2, src3`.
-Arg4: ast::Arg4<ast::ParsedArgParams<'input>> = {
-    <dst:DstOperand> "," <src1:Operand> "," <src2:Operand> "," <src3:Operand> => ast::Arg4 { dst, src1, src2, src3 }
-};
-
-// Four-operand argument list whose first source is written in square
-// brackets: `dst, [src1], src2, src3`. Produces the same `ast::Arg4` as `Arg4`.
-Arg4Atom: ast::Arg4<ast::ParsedArgParams<'input>> = {
-    <dst:DstOperand> "," "[" <src1:Operand> "]" "," <src2:Operand> "," <src3:Operand> => ast::Arg4 { dst, src1, src2, src3 }
-};
-
-// Argument list with one mandatory destination, an optional second
-// destination written as `|dst2`, and two sources: `dst1[|dst2], src1, src2`.
-Arg4Setp: ast::Arg4Setp<ast::ParsedArgParams<'input>> = {
-    <dst1:ExtendedID> <dst2:OptionalDst?> "," <src1:Operand> "," <src2:Operand> => ast::Arg4Setp { dst1, dst2, src1, src2 }
-};
-
-// Five-operand argument list: `dst, src1, src2, src3, src4`.
-Arg5: ast::Arg5<ast::ParsedArgParams<'input>> = {
-    <dst:DstOperand> "," <src1:Operand> "," <src2:Operand> "," <src3:Operand> "," <src4:Operand> => ast::Arg5 { dst, src1, src2, src3, src4 }
-};
-
-// Like `Arg4Setp` with a third source that may be prefixed by `!`:
-// `dst1[|dst2], src1, src2, [!]src3`.
-// TODO: pass src3 negation somewhere (the optional "!" is parsed but is not
-//       stored in the produced value)
-Arg5Setp: ast::Arg5Setp<ast::ParsedArgParams<'input>> = {
-    <dst1:ExtendedID> <dst2:OptionalDst?> "," <src1:Operand> "," <src2:Operand> "," "!"? <src3:Operand> => ast::Arg5Setp { dst1, dst2, src1, src2, src3 }
-};
-
-// Argument list of a call, returned as `(return params, callee, call params)`.
-// Both the parenthesised return list and the parenthesised parameter list are
-// optional; a missing list is represented by an empty vector.
-ArgCall: (Vec<&'input str>, &'input str, Vec<ast::Operand<&'input str>>) = {
-    "(" <rets:Comma<ExtendedID>> ")" "," <callee:ExtendedID> "," "(" <args:Comma<CallOperand>> ")" => (rets, callee, args),
-    "(" <rets:Comma<ExtendedID>> ")" "," <callee:ExtendedID> => (rets, callee, Vec::new()),
-    <callee:ExtendedID> "," "(" <args:Comma<CallOperand>> ")" => (Vec::new(), callee, args),
-    <callee:ExtendedID> => (Vec::new(), callee, Vec::new()),
-};
-
-// Second destination written as `|name`; yields just the name.
-OptionalDst: &'input str = {
-    "|" <ExtendedID>
-}
-
-// Source operand: a register, a register plus signed 32-bit offset, an
-// immediate value, or a vector member access.
-SrcOperand: ast::Operand<&'input str> = {
-    <reg:ExtendedID> => ast::Operand::Reg(reg),
-    <base:ExtendedID> "+" <delta:S32Num> => ast::Operand::RegOffset(base, delta),
-    <imm:ImmediateValue> => ast::Operand::Imm(imm),
-    // `MemberOperand` yields `(base name, member index)`.
-    <member:MemberOperand> => ast::Operand::VecMember(member.0, member.1)
-}
-
-// Source operand that may additionally be a brace-enclosed vector of names.
-SrcOperandVec: ast::Operand<&'input str> = {
-    <SrcOperand>,
-    <VectorExtract> => ast::Operand::VecPack(<>),
-}
-
-// Destination operand: a register or a vector member access.
-DstOperand: ast::Operand<&'input str> = {
-    <reg:ExtendedID> => ast::Operand::Reg(reg),
-    // `MemberOperand` yields `(base name, member index)`.
-    <member:MemberOperand> => ast::Operand::VecMember(member.0, member.1)
-}
-
-// Destination operand that may additionally be a brace-enclosed vector of names.
-DstOperandVec: ast::Operand<&'input str> = {
-    <DstOperand>,
-    <VectorExtract> => ast::Operand::VecPack(<>),
-}
-
-// Vector width modifier, returned as the element count.
-VectorPrefix: u8 = {
-    ".v2" => 2u8,
-    ".v4" => 4u8
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-file
-// Matches `.file <u32> <string>`, optionally followed by `, <u32>, <u32>`.
-// No action code or result type is given here, so both are left to LALRPOP.
-File = {
-    ".file" U32Num String ("," U32Num "," U32Num)?
-};
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-section
-// Matches `.section <dot-id> { ... }` where the body is zero or more
-// `SectionDwarfLines` entries. No action code or result type is given here.
-Section = {
-    ".section" DotID "{" SectionDwarfLines* "}"
-};
-
-// One entry inside a `.section` body; declared to produce `()`.
-// Accepted shapes:
-//   * a bit-size type followed by a comma-separated list of u32 numbers
-//   * `.b32`/`.b64` followed by a label
-//   * `.b32`/`.b64` followed by a label, "+" and a u32 offset
-SectionDwarfLines: () = {
-    AnyBitType Comma<U32Num>,
-    ".b32" SectionLabel,
-    ".b64" SectionLabel,
-    ".b32" SectionLabel "+" U32Num,
-    ".b64" SectionLabel "+" U32Num,
-};
-
-// Label referenced from a section entry: either a plain `ID` or a `DotID`.
-SectionLabel = {
-    ID,
-    DotID
-};
-
-// Any of the untyped bit-size type tokens: 8, 16, 32 or 64 bits.
-AnyBitType = {
-    ".b8", ".b16", ".b32", ".b64"
-};
-
-// Scalar variable declaration: optional alignment, a type parsed by `T`,
-// and the variable name. Returned as `(align, type, name)`.
-VariableScalar<T>: (Option<u32>, T, &'input str) = {
-    <align:Align?> <v_type:T> <name:ExtendedID> => (align, v_type, name)
-}
-
-// Vector variable declaration: optional alignment, a `.v2`/`.v4` prefix,
-// a type parsed by `T`, and the variable name.
-// Returned as `(align, vector length, type, name)`.
-VariableVector<T>: (Option<u32>, u8, T, &'input str) = {
-    <align:Align?> <v_len:VectorPrefix> <v_type:T> <name:ExtendedID> => (align, v_len, v_type, name)
-}
-
-// Array (or pointer) variable declaration with optional alignment and
-// optional initializer. Returned as `(align, type, name, array-or-pointer)`.
-// empty dimensions [0] means it's a pointer
-// NOTE(review): the type parameter `T` is not used by the pattern, which always
-// parses `SizedScalarType` - confirm every instantiation passes that type.
-VariableArrayOrPointer<T>: (Option<u32>, T, &'input str, ast::ArrayOrPointer) = {
-    <align:Align?> <typ:SizedScalarType> <name:ExtendedID> <dims:ArrayDimensions> <init:ArrayInitializer?> => {
-        let mut dims = dims;
-        let array_init = if let Some(init) = init {
-            // `to_vec` receives `dims` mutably and may adjust it; on failure the
-            // error is recorded and an empty initializer is used.
-            let init_vec = init.to_vec(typ, &mut dims).unwrap_or_else(|error| {
-                errors.push(ParseError::User { error });
-                Vec::new()
-            });
-            ast::ArrayOrPointer::Array { dimensions: dims, init: init_vec }
-        } else {
-            // Without an initializer a zero dimension is only accepted when it
-            // is the sole dimension.
-            if dims.len() > 1 && dims.contains(&0) {
-                errors.push(ParseError::User { error: ast::PtxError::ZeroDimensionArray });
-            }
-            if dims == [0u32] {
-                ast::ArrayOrPointer::Pointer
-            } else {
-                ast::ArrayOrPointer::Array { dimensions: dims, init: Vec::new() }
-            }
-        };
-        (align, typ, name, array_init)
-    }
-}
-
-// Sequence of array dimensions. Only the first dimension may be written
-// empty (`[]`); it is recorded as 0.
-// [0] and [] are treated the same
-ArrayDimensions: Vec<u32> = {
-    ArrayEmptyDimension => vec![0u32],
-    ArrayEmptyDimension <rest:ArrayDimension+> => {
-        let mut all = Vec::with_capacity(rest.len() + 1);
-        all.push(0u32);
-        all.extend(rest);
-        all
-    },
-    <ArrayDimension+>
-}
-
-// An empty dimension: `[` immediately followed by `]`.
-ArrayEmptyDimension = {
-    "[" "]" 
-}
-
-// A single sized dimension `[n]`; yields `n`.
-ArrayDimension: u32  = {
-    "[" <U32Num> "]",
-}
-
-// Initializer of an array variable: `=` followed by a braced value list.
-ArrayInitializer: ast::NumsOrArrays<'input> = {
-    "=" <NumsOrArraysBracket>
-}
-
-// A brace-enclosed `NumsOrArrays`; yields the inner value.
-NumsOrArraysBracket: ast::NumsOrArrays<'input> = {
-    "{" <NumsOrArrays> "}"
-}
-
-// Content of one initializer brace level: either a (possibly empty) list of
-// nested braced initializers, or a non-empty list of number tokens. For
-// numbers only the digit string and radix are kept; the third tuple element
-// of each token is dropped.
-NumsOrArrays: ast::NumsOrArrays<'input> = {
-    <nested:Comma<NumsOrArraysBracket>> => ast::NumsOrArrays::Arrays(nested),
-    <tokens:CommaNonEmpty<NumToken>> => {
-        let nums = tokens.into_iter().map(|(digits, radix, _)| (digits, radix)).collect();
-        ast::NumsOrArrays::Nums(nums)
-    },
-}
-
-// Possibly empty list of `T` separated by ",". The pattern is zero or more
-// `T ","` pairs followed by an optional final `T`, so a trailing comma is
-// also accepted.
-Comma<T>: Vec<T> = {
-    <v:(<T> ",")*> <e:T?> => {
-        let mut items = v;
-        // Extending with an `Option` appends the last element only when present.
-        items.extend(e);
-        items
-    }
-};
-
-// List of one or more `T` separated by ",". The final element is mandatory,
-// so neither an empty list nor a trailing comma is accepted.
-CommaNonEmpty<T>: Vec<T> = {
-    <v:(<T> ",")*> <e:T> => {
-        let mut items = v;
-        items.push(e);
-        items
-    }
-};
-
-// Matches either `T1` or `T2`. The result is declared as the type of `T1`,
-// so `T2` has to produce that same type.
-#[inline]
-Or<T1, T2>: T1 = {
-    T1,
-    T2
-}
-
-// Matches any of `T1`, `T2` or `T3`. The result is declared as the type of
-// `T1`, so `T2` and `T3` have to produce that same type.
-#[inline]
-Or3<T1, T2, T3>: T1 = {
-    T1,
-    T2,
-    T3
-}
-\ No newline at end of file