libTooling application to rewrite statements while preserving leading and trailing comments

Question

I am writing a source-to-source transformation tool using clang libTooling to transform C source code. This involves rewriting the following statement types: clang::IfStmt, clang::WhileStmt, clang::ForStmt and clang::DoStmt. Each of these statements contains a condition. The example included below shows how the condition is handled for clang::IfStmt, but the same code applies to the other statements.

I need to access the entire condition text (including any leading and trailing comments) and rewrite if (condition_text) as if (/*BEGIN*/condition_text/*END*/) while preserving any leading or trailing comment text from the condition_text. For example, if (/*FOO*/i<10/*BAR*/) should be rewritten as if (/*BEGIN*//*FOO*/i<10/*BAR*//*END*/). I do not understand why I cannot use the clang::Lexer to search backwards and forwards through raw tokens to find the leading and trailing comments. The IfStmt callback from the RecursiveASTVisitor is shown as an example below:

    //! Visitor callback for 'clang::IfStmt'.
    //!
    //! Wraps the source text of the statement's condition in
    //! "/*BEGIN*/" ... "/*END*/" markers via the rewriter.
    //! Always returns true so that the AST traversal continues.
    bool VisitIfStmt(const clang::IfStmt *IS) const {
        // Only act when the statement actually has a condition expression.
        if (IS->getCond()) {
            // SR is the range to replace and condString its current text; an
            // empty string means the helper could not produce a usable range.
            if (const auto& [SR, condString] = getSourceFromStmt(
                IS->getCond()); !condString.empty()) {
                // NOTE(review): SM and LO are not used anywhere in this block.
                const auto& SM = mContext.getSourceManager();
                const auto& LO = mContext.getLangOpts();
                // Replace the condition string.
                // NOTE(review): the format string has a single "{}" placeholder,
                // so the value of gProbeIndex never appears in the output; the
                // post-increment side effect still takes place.
                const auto probeText = std::format(
                    "/*BEGIN*/{}/*END*/"
                    , condString
                    , gProbeIndex++);
                mRewriter.ReplaceText(SR, probeText);
            }
        }
        // Returning true continues the traversal.
        return true;
    }

This uses a helper function, getSourceFromStmt, to return an updated SourceRange and the condition string, which is where I am having problems.

Running the above against the very simple C code shown below (note that the C code contains TRUE/FALSE macros, so we have to be careful to use expansion locations):

#define TRUE 1
#define FALSE 0

void foo() {
    int t = 0;
    // Leading Macro test
    if (TRUE == 1) {
    }

    // Leading Comment, trailing Macro
    if (/*COMMENT*/0 == FALSE) {
    }

    // trailing Comment after Macro
    if (0 == TRUE/*COMMENT*/) {
    }

    // Leading comment
    if (/*COMMENT*/2 < t) {
    }

    // Trailing comment
    if (t < 2 /*COMMENT*/) {
    }
}

produces the following rewritten text, in which the leading and trailing comments do not always end up inside the BEGIN/END markers:

#define TRUE 1
#define FALSE 0

void foo() {
    int t = 0;
    // Leading Macro test
    if (/*BEGIN*/TRUE == 1/*END*/) {
    }

    // Leading Comment, trailing Macro
    if (/*COMMENT*//*BEGIN*/0 == FALSE/*END*/) {
    }

    // trailing Comment after Macro
    if (/*BEGIN*/0 == TRUE/*COMMENT*/)/*END*/ {
    }

    // Leading comment
    if (/*COMMENT*//*BEGIN*/2 < t/*END*/) {
    }

    // Trailing comment
    if (/*BEGIN*/t < 2/*END*/ /*COMMENT*/) {
    }
}

The full application source is shown below:

// SYSTEM INCLUDES
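// NOTE: the header names were lost when this listing was extracted; the list
// below is reconstructed from the names the code uses and may differ from the
// original.
#include <clang/AST/ASTContext.h>
#include <clang/AST/RecursiveASTVisitor.h>
#include <clang/Basic/SourceManager.h>
#include <clang/Frontend/ASTUnit.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/Utils.h>
#include <clang/Lex/Lexer.h>
#include <clang/Rewrite/Core/Rewriter.h>
#include <llvm/Support/CommandLine.h>
#include <format>
#include <iostream>
#include <vector>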
using namespace llvm;

namespace {
    //! Option category object for this tool, labelled "Tool Category".
    cl::OptionCategory gToolCategory{"Tool Category"};

    //! Running probe counter; starts at zero and is post-incremented by the
    //! visitor callbacks each time they build a replacement string.
    unsigned gProbeIndex{0u};
}

namespace tooling {
    //! AST visitor that wraps the condition of every 'if' and 'for' statement
    //! found in main-file functions in "/*BEGIN*/" ... "/*END*/" markers.
    //!
    //! The visitor does not own the ASTContext or the Rewriter; both are held
    //! by reference and must outlive the visitor.
    class CProbeVisitor : public clang::RecursiveASTVisitor<CProbeVisitor> {
    public:
        //! Construct the visitor.
        //! \param rContext  AST context used to reach the SourceManager and LangOptions.
        //! \param rRewriter Rewriter that receives the text replacements.
        explicit CProbeVisitor(
            clang::ASTContext& rContext,
            clang::Rewriter& rRewriter)
            : mContext{rContext}
            , mRewriter{rRewriter}
        {}

        //! Ask the base visitor for post-order traversal.
        static bool shouldTraversePostOrder() {
            // Must return true to traverse the AST in post-order.
            return true;
        }

        //! Traverse a function declaration, but only when it is located in the
        //! main file; functions from other files are skipped entirely.
        //! \return false to abort the whole traversal, true to continue.
        bool TraverseFunctionDecl(clang::FunctionDecl *FD) {
            const auto& SM = mContext.getSourceManager();
            if (!SM.isInMainFile(SM.getExpansionLoc(FD->getLocation()))) {
                // Skip this FunctionDecl and continue traversal.
                return true;
            }

            // Breadcrumb to access function name in other VisitStmt callbacks.
            mCurrentFunction = FD;

            // Now traverse the function body - should not need to check
            // locations in the visited statement callbacks.
            const bool shouldContinue =
                RecursiveASTVisitor::TraverseFunctionDecl(FD);

            if (!shouldContinue) {
                // Stop traversal.
                return false;
            }

            // This is called when exiting the function.
            // You can put your function exit logic here.
            return true;  // Continue traversal.
        }

        //! Visitor callback for 'clang::IfStmt'.
        //!
        //! Wraps the condition's source text in BEGIN/END markers.
        //! Always returns true so that the traversal continues.
        bool VisitIfStmt(const clang::IfStmt *IS) const {
            if (IS->getCond()) {
                // An empty condString means no usable range was found.
                if (const auto& [SR, condString] = getSourceFromStmt(
                    IS->getCond()); !condString.empty()) {
                    // NOTE(review): SM and LO are unused in this callback.
                    const auto& SM = mContext.getSourceManager();
                    const auto& LO = mContext.getLangOpts();
                    // Replace the condition string.
                    // NOTE(review): only one "{}" placeholder exists, so the
                    // value of gProbeIndex is not emitted; the increment still
                    // happens.
                    const auto probeText = std::format(
                        "/*BEGIN*/{}/*END*/"
                        , condString
                        , gProbeIndex++);
                    mRewriter.ReplaceText(SR, probeText);
                }
            }
            // Returning true continues the traversal.
            return true;
        }

        //! Visitor callback for "clang::ForStmt".
        //!
        //! Same handling as VisitIfStmt, applied to the loop condition. A
        //! 'for' without a condition expression is left untouched.
        bool VisitForStmt(const clang::ForStmt *FS) const {
            if (FS->getCond()) {
                // An empty condString means no usable range was found.
                if (const auto& [SR, condString] = getSourceFromStmt(
                    FS->getCond()); !condString.empty()) {
                    // NOTE(review): SM and LO are unused in this callback.
                    const auto& SM = mContext.getSourceManager();
                    const auto& LO = mContext.getLangOpts();
                    // Replace the condition string.
                    const auto probeText = std::format(
                        "/*BEGIN*/{}/*END*/"
                        , condString
                        , gProbeIndex++);
                    mRewriter.ReplaceText(SR, probeText);
                }
            }
            // Returning true continues the traversal.
            return true;
        }

        //! Visitor callback for "clang::TranslationUnitDecl".
        //! Does nothing except let the traversal continue.
        static bool VisitTranslationUnitDecl(const clang::TranslationUnitDecl *TU) {
            return true;
        }

    private:
        //! Helper method to get the stmt source (accounting for prior rewrites & expansion locations).
        //!
        //! \return the (comment-extended) range together with its current
        //!         rewritten text, or a default-constructed pair (empty
        //!         string) when the range's two ends are not written in the
        //!         same file.
        std::pair<clang::CharSourceRange, std::string> getSourceFromStmt(const clang::Stmt* stmt) const {
            const auto& SM = mContext.getSourceManager();
            // for some reason this is start of condition (not leading comments up to the end of the trailing comments)
            const auto SR = getSourceRangeWithComments(stmt);
            if (SM.isWrittenInSameFile(SR.getBegin(), SR.getEnd())) {
                // Get the text *after* taking prior rewrites into account.
                return { SR, mRewriter.getRewrittenText(SR) };
            }
            return {};
        }

        //! Compute the expansion range of 'stmt' and try to widen it so that
        //! it also covers comments directly before and after the statement.
        //!
        //! \return a character range (not a token range) from the adjusted
        //!         begin location to the adjusted end location.
        auto getSourceRangeWithComments(
            const clang::Stmt* stmt) const -> clang::CharSourceRange {
            const auto& SM = mContext.getSourceManager();
            const auto& LO = mContext.getLangOpts();

            // Get the expansion SourceRange of the expression.
            const auto SR = SM.getExpansionRange(stmt->getSourceRange());
            auto beginLoc = SR.getBegin();
            auto endLoc = SR.getEnd();

            // Adjust the beginning location backwards from the SR.getBegin.
            // NOTE(review): in the sample output a leading comment is never
            // absorbed ("/*COMMENT*//*BEGIN*/..."). Presumably getRawToken
            // lexes forward from the character it is given, so probing one
            // character before the expression lands on the closing '/' of the
            // comment rather than on its start - confirm against the Lexer
            // documentation.
            beginLoc = clang::Lexer::GetBeginningOfToken(beginLoc, SM, LO);
            while (SM.isWrittenInSameFile(beginLoc, SR.getBegin())) {
                clang::Token token;
                // getRawToken's result is negated here: the branch is taken
                // only when it returns false and the token is a comment.
                if (!clang::Lexer::getRawToken(beginLoc.getLocWithOffset(-1), token, SM, LO) &&
                    token.is(clang::tok::comment)) {
                    beginLoc = token.getLocation();
                } else {
                    // failed to get the raw token.
                    break;
                }
            }

            // Adjust the end location to the end of the trailing comments
            // NOTE(review): two behaviours are visible in the sample output:
            //  - a comment separated from the expression by whitespace is not
            //    picked up ("t < 2/*END*/ /*COMMENT*/"); presumably getRawToken
            //    reports failure on whitespace unless told to ignore it -
            //    confirm;
            //  - an adjacent comment is picked up together with the following
            //    ')' ("TRUE/*COMMENT*/)/*END*/"); presumably because
            //    token.getEndLoc() is already past the comment and
            //    getLocForEndOfToken then steps over one more token - confirm.
            endLoc = clang::Lexer::getLocForEndOfToken(endLoc, 0, SM, LO);
            while (SM.isWrittenInSameFile(SR.getEnd(), endLoc)) {
                clang::Token token;
                if (!clang::Lexer::getRawToken(endLoc, token, SM, LO) &&
                    token.is(clang::tok::comment)) {
                    endLoc = clang::Lexer::getLocForEndOfToken(token.getEndLoc(), 0, SM, LO);
                } else {
                    break;
                }
            }

            // Now beginLoc and endLoc include the leading and trailing comments
            // The 'false' marks the result as a character range.
            return {{beginLoc, endLoc}, false};
        }

        //! AST context of the translation unit being visited (not owned).
        clang::ASTContext& mContext;
        //! Rewriter collecting the replacements (not owned).
        clang::Rewriter& mRewriter;
        //! Function currently being traversed; set by TraverseFunctionDecl.
        const clang::FunctionDecl *mCurrentFunction = nullptr;
    };
}

// This is all boilerplate for a program using the Clang C++ API
// ("libTooling") but not using the "tooling" part specifically.
int main(int argc, char const **argv)
{
    // Copy the arguments into a vector of char pointers since that is
    // what 'createInvocationFromCommandLine' wants.
    std::vector commandLine;
    {
        // Path to the 'clang' binary that I am behaving like.  This path is
        // used to compute the location of compiler headers like stddef.h.
        // The Makefile sets 'CLANG_LLVM_INSTALL_DIR' on the compilation
        // command line.
        //commandLine.push_back("C:/tools/llvm-project/build-host/debug/bin/clang");
        commandLine.push_back("C:/tools/llvm/bin/clang");

        for (int i = 1; i < argc; ++i) {
            commandLine.push_back(argv[i]);
        }
    }

    // Parse the command line options.
    const std::shared_ptr compilerInvocation(
        clang::createInvocation(llvm::ArrayRef(commandLine)));
    if (!compilerInvocation) {
        // Command line parsing errors have already been printed.
        return 2;
    }

    // Boilerplate setup for 'LoadFromCompilerInvocationAction'.
    const auto pchContainerOps = std::make_shared();
    const clang::IntrusiveRefCntPtr diagnosticsEngine(
        clang::CompilerInstance::createDiagnostics(
            new clang::DiagnosticOptions));

    // Run the Clang parser to produce an AST.
    const std::unique_ptr ast(
        clang::ASTUnit::LoadFromCompilerInvocationAction(
            compilerInvocation,
            pchContainerOps,
            diagnosticsEngine));

    if (ast == nullptr || diagnosticsEngine->getNumErrors() > 0) {
        // Error messages have already been printed.
        return 2;
    }

    clang::ASTContext& astContext = ast->getASTContext();
    const auto& SM = astContext.getSourceManager();
    clang::Rewriter rewriter(astContext.getSourceManager(), astContext.getLangOpts());


    tooling::CProbeVisitor visitor(astContext, rewriter);
    visitor.TraverseDecl(astContext.getTranslationUnitDecl());

    const auto MainFileID = SM.getMainFileID();
    const auto MainFileRange = clang::SourceRange(
        SM.getLocForStartOfFile(MainFileID),
        SM.getLocForEndOfFile(MainFileID));

    const auto FinalSourceCode = rewriter.getRewrittenText(MainFileRange);

    // Print the final source code to the console
    std::cout << FinalSourceCode << '
';

    return 0;
}

libTooling application to rewrite statements while preserving leading and trailing comments

Answers (1)

Difficulties lexing backward

Getting the `if` condition

Getting the `for` condition

Complete demonstration program

Sample output

Related Questions

libTooling application to rewrite statements while preserving leading and trailing comments

Answers (1)

Difficulties lexing backward

Getting the if condition

Getting the for condition

Complete demonstration program

Sample output

Related Questions

Getting the `if` condition

Getting the `for` condition