Hello,
Sorry for the long description, and thanks for your patience! I want to change the type of scf::ForOp’s iterArgs. I assumed this would be easy, but I have been blocked on it for a while. After some trial and error I came up with a patch, but it still has bugs in some cases.
This is the input MLIR:
%12 = vector.transfer_read %8[%c0, %c0], %cst {..} : memref<1x4xf32, #map1>, vector<1x1xf32>
%13 = vector.transfer_read %8[%c0, %c1], %cst {..} : memref<1x4xf32, #map1>, vector<1x1xf32>
%16:2 = scf.for %arg3 = %c0 to %c1024 step %c2
iter_args(%arg4 = %12, %arg5 = %13) ->
(vector<1x1xf32>, vector<1x1xf32>) {
...
%32 = vector.contract {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} %21, %22, %arg4 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
...
scf.yield %31, %32 : vector<1x1xf32>, vector<1x1xf32>
}
vector.transfer_write %16#1, %8[%c0, %c3] {..} : vector<1x1xf32>, memref<1x4xf32, #map1>
vector.transfer_write %16#0, %8[%c0, %c2] {..} : vector<1x1xf32>, memref<1x4xf32, #map1>
After running ConvertVectorToGPUPass, the vector ops are converted to standard ops:
%12 = load %8[%c0, %c0] : memref<1x4xf32, #map1>
%13 = load %8[%c0, %c1] : memref<1x4xf32, #map1>
%16:2 = scf.for %arg3 = %c0 to %c1024 step %c2
iter_args(%arg4 = %12, %arg5 = %13) ->
(f32, f32) {
...
%43 = mulf %27, %29 : f32
%44 = addf %43, %arg4 : f32
...
scf.yield %44, %46 : f32, f32
}
store %16#1, %8[%c0, %c3] {..} : f32, memref<1x4xf32, #map1>
store %16#0, %8[%c0, %c2] {..} : f32, memref<1x4xf32, #map1>
In order to convert `vector.contract %21, %22, %arg4` successfully, I have to convert %arg4 to type f32. (%21 and %22 become f32 naturally, so only %arg4 needs to be converted.)
This is my method, but it is buggy:
(1) I create a new scf::ForOp with f32 iterArgs, and erase the original one.
std::map<Operation *, Value> loopResultUserMap;
void convertForOpSignatureToF32(FuncOp funcOp, MLIRContext *ctx) {
bool changed = true;
while (changed) {
changed = false;
funcOp.walk([&](Operation *op) {
auto loop = dyn_cast<scf::ForOp>(op);
if (!loop || loop.getNumResults() == 0 ||
loop.getResult(0).getType() == FloatType::getF32(ctx))
return WalkResult::advance();
// Create a new scf::ForOp with f32 iterArgs.
llvm::SmallVector<Type, 4> newResultTypes;
for (unsigned i = 0; i < loop.getNumResults(); ++i)
newResultTypes.push_back(FloatType::getF32(ctx));
OpBuilder builder(loop);
auto newloop = cloneWithNewResultTypes(loop, newResultTypes);
builder.insert(newloop);
unsigned i = 0;
for (OpResult result : loop.getResults()) {
for (auto &use : result.getUses())
loopResultUserMap[use.getOwner()] = newloop->getResult(i);
++i;
}
// I found that I can’t directly use
//   loop.getResult(i).replaceAllUsesWith(newloop->getResult(i));
// here, or I get a crash if I try to dump vector.contract later.
loop.erase();
changed = true;
return WalkResult::interrupt();
});
}
}
(2) In the partial conversion from the vector dialect to the gpu/standard dialects, when converting vector::TransferReadOp to LoadOp:
LogicalResult matchAndRewrite(...) {
Value newOp = rewriter.create<LoadOp>(loc, op.memref(), op.indices());
rewriter.replaceOp(op, newOp);
// If the load is used by an scf::ForOp, change the corresponding BlockArgument’s type to f32 as well.
for (auto &use : op.vector().getUses()) {
if (scf::ForOp loop = dyn_cast<scf::ForOp>(use.getOwner())) {
unsigned argNo = use.getOperandNumber() - 3;
BlockArgument newArg =
loop.getBody()->insertArgument(argNo + 1, newOp.getType());
loop.getBody()->getArgument(argNo + 2).replaceAllUsesWith(newArg);
loop.getBody()->eraseArgument(argNo + 2);
}
}
}
(3) When converting vector::TransferWriteOp to StoreOp:
LogicalResult matchAndRewrite(...) {
Value operand0 = operands[0];
if (loopResultUserMap.find(op.getOperation()) != loopResultUserMap.end()) {
operand0 = loopResultUserMap[op.getOperation()];
}
// If the store uses the old scf::ForOp’s yield result, change operand0 to use the new scf::ForOp’s yield result instead.
rewriter.create<StoreOp>(loc, operand0, operands[1], op.indices());
rewriter.eraseOp(op);
}
After these steps I can convert the vector ops to standard ops, but I found that if the scf::ForOp has more than 16 iterArgs (16 or fewer is fine), the compiler crashes.
I also tried SignatureConversion, but I still couldn’t make it work.
Experiment 1:
class ForOpConversionHack final : public VectorToGPUPattern<scf::ForOp> {
public:
LogicalResult matchAndRewrite(..) const override
{
TypeConverter::SignatureConversion signatureConverter(loop.getNumOperands());
for (unsigned i = 3; i < loop.getNumOperands(); ++i)
signatureConverter.addInputs(i, FloatType::getF32(ctx));
rewriter.applySignatureConversion(&loop.getRegion(), signatureConverter);
return success();
}
};
OwningRewritePatternList pattern;
pattern.insert<ForOpConversionHack>(ctx);
std::unique_ptr<ConversionTarget> target = std::make_unique<ConversionTarget>(*ctx);
applyPartialConversion(func, *target, pattern);
Experiment 2:
ConversionPatternRewriter rewriter(ctx);
bool changed = true;
while (changed) {
changed = false;
func.walk([&](Operation *op) {
auto loop = dyn_cast<scf::ForOp>(op);
if (!loop || loop.getNumResults() == 0 || loop.getResult(0).getType() == FloatType::getF32(ctx))
return WalkResult::advance();
TypeConverter::SignatureConversion signatureConverter(loop.getNumOperands());
for (unsigned i = 3; i < loop.getNumOperands(); ++i)
signatureConverter.addInputs(i, FloatType::getF32(ctx));
rewriter.applySignatureConversion(&loop.getRegion(), signatureConverter);
return WalkResult::interrupt();
});
}
Thanks in advance!
CY