Robert Harvey
Robert Harvey

Reputation: 180787

Using CefGlue to return the HTML page from a URL

I am attempting to write an implementation for the following (prototype) method:

var result = browser.GetHtml(string url);

The reason I need this is because there are a number of pages that push a mound of Javascript to the browser, and then the Javascript renders the page. The only way to retrieve such pages reliably is to allow the Javascript to execute in a browser environment before retrieving the resulting HTML.

My current attempt is using CefGlue. After downloading this project and combining it with the code in this answer I came up with the following code (included here for completeness):

using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.Drawing.Printing;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using Xilium.CefGlue;

namespace OffScreenCefGlue
{
    /// <summary>
    /// Console host for the demo: boots CEF, opens one off-screen browser on a
    /// fixed URL, and keeps the process alive until input is read from stdin.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Entry point for both the browser process and any CEF secondary process
        /// launched from this executable.
        /// </summary>
        /// <param name="args">Command-line arguments (not read directly here).</param>
        private static void Main(string[] args)
        {
            // Load CEF. This checks for the correct CEF version.
            CefRuntime.Load();

            // Arguments and application handler shared by every CEF process.
            var cefMainArgs = new CefMainArgs(new string[0]);
            var cefApp = new DemoCefApp();

            // This is where the code path diverges for child processes.
            // Per the CEF documentation, ExecuteProcess returns -1 only in the
            // browser (main) process. In a secondary process it blocks until that
            // process should exit and then returns its exit code, so the only
            // correct thing to do with any other value is to leave Main instead of
            // initializing a second browser.
            int exitCode = CefRuntime.ExecuteProcess(cefMainArgs, cefApp);
            if (exitCode != -1)
            {
                Environment.ExitCode = exitCode;
                return;
            }

            // Settings for all of CEF (e.g. process management and control).
            var cefSettings = new CefSettings
            {
                SingleProcess = false,
                MultiThreadedMessageLoop = true
            };

            // Start the browser process.
            CefRuntime.Initialize(cefMainArgs, cefSettings, cefApp);

            // Instruct CEF to not render to a window at all.
            CefWindowInfo cefWindowInfo = CefWindowInfo.Create();
            cefWindowInfo.SetAsOffScreen(IntPtr.Zero);

            // Settings for the browser window itself (e.g. should JavaScript be enabled?).
            var cefBrowserSettings = new CefBrowserSettings();

            // Initialize the custom interactions with the browser process.
            // The browser window will be 1280 x 720 (pixels).
            var cefClient = new DemoCefClient(1280, 720);

            // Start up the browser instance.
            string url = "http://www.reddit.com/";
            CefBrowserHost.CreateBrowser(cefWindowInfo, cefClient, cefBrowserSettings, url);

            // Hang, to let the browser do its work.
            Console.Read();

            // Clean up CEF.
            CefRuntime.Shutdown();
        }
    }

    /// <summary>
    /// Application-level CEF handler for this demo. It overrides nothing; an
    /// instance is handed to <c>CefRuntime.ExecuteProcess</c> and
    /// <c>CefRuntime.Initialize</c> in <c>Program.Main</c>.
    /// </summary>
    internal class DemoCefApp : CefApp
    {
    }

    /// <summary>
    /// Client for the demo browser. Owns one load handler and one render handler
    /// and returns those same instances whenever they are requested.
    /// </summary>
    internal class DemoCefClient : CefClient
    {
        // Created once in the constructor; never replaced afterwards.
        private readonly DemoCefRenderHandler _offScreenRenderer;
        private readonly DemoCefLoadHandler _pageLoadHandler;

        /// <summary>
        /// Builds the client and its handlers.
        /// </summary>
        /// <param name="windowWidth">Width passed to the render handler.</param>
        /// <param name="windowHeight">Height passed to the render handler.</param>
        public DemoCefClient(int windowWidth, int windowHeight)
        {
            _offScreenRenderer = new DemoCefRenderHandler(windowWidth, windowHeight);
            _pageLoadHandler = new DemoCefLoadHandler();
        }

        /// <summary>Returns the load handler created in the constructor.</summary>
        protected override CefLoadHandler GetLoadHandler()
        {
            return _pageLoadHandler;
        }

        /// <summary>Returns the render handler created in the constructor.</summary>
        protected override CefRenderHandler GetRenderHandler()
        {
            return _offScreenRenderer;
        }
    }

    /// <summary>
    /// Load handler that logs main-frame load start/end to the console and stores
    /// the main frame's source in <see cref="Html"/> once loading has ended.
    /// </summary>
    internal class DemoCefLoadHandler : CefLoadHandler
    {
        /// <summary>
        /// Source of the main frame, assigned after the main frame finishes loading.
        /// Remains null until then.
        /// </summary>
        public string Html { get; private set; }

        /// <summary>Logs the URL when the main frame starts loading.</summary>
        protected override void OnLoadStart(CefBrowser browser, CefFrame frame)
        {
            // A single CefBrowser instance can handle multiple requests
            //   for a single URL if there are frames (i.e. <FRAME>, <IFRAME>).
            if (frame.IsMain)
            {
                Console.WriteLine("START: {0}", browser.GetMainFrame().Url);
            }
        }

        /// <summary>
        /// Starts capturing the page source when the main frame finishes loading.
        /// </summary>
        /// <remarks>
        /// Deliberately not <c>async void</c>: an exception thrown from an
        /// <c>async void</c> method cannot be observed by anyone and takes the
        /// process down. The asynchronous work lives in a Task-returning helper
        /// and any fault is reported on standard error instead.
        /// </remarks>
        protected override void OnLoadEnd(CefBrowser browser, CefFrame frame, int httpStatusCode)
        {
            if (!frame.IsMain)
            {
                return;
            }

            CaptureHtmlAsync(browser, httpStatusCode).ContinueWith(
                failed => Console.Error.WriteLine("Failed to capture page source: {0}", failed.Exception),
                TaskContinuationOptions.OnlyOnFaulted);
        }

        // Fetches the main frame's source, stores it in Html, then logs the end of the load.
        private async Task CaptureHtmlAsync(CefBrowser browser, int httpStatusCode)
        {
            Html = await browser.GetSourceAsync();
            Console.WriteLine("END: {0}, {1}", browser.GetMainFrame().Url, httpStatusCode);
        }
    }

    /// <summary>
    /// Off-screen render handler: reports a fixed-size view rectangle and writes
    /// every painted frame to "LastOnPaint.png".
    /// </summary>
    internal class DemoCefRenderHandler : CefRenderHandler
    {
        private readonly int _windowHeight;
        private readonly int _windowWidth;

        /// <summary>
        /// Creates a handler that reports a view of the given size.
        /// </summary>
        /// <param name="windowWidth">View width reported from <c>GetViewRect</c>.</param>
        /// <param name="windowHeight">View height reported from <c>GetViewRect</c>.</param>
        public DemoCefRenderHandler(int windowWidth, int windowHeight)
        {
            _windowWidth = windowWidth;
            _windowHeight = windowHeight;
        }

        /// <summary>Reports the same rectangle as <c>GetViewRect</c>.</summary>
        protected override bool GetRootScreenRect(CefBrowser browser, ref CefRectangle rect)
        {
            return GetViewRect(browser, ref rect);
        }

        /// <summary>Maps view coordinates to screen coordinates one-to-one.</summary>
        protected override bool GetScreenPoint(CefBrowser browser, int viewX, int viewY, ref int screenX, ref int screenY)
        {
            screenX = viewX;
            screenY = viewY;
            return true;
        }

        /// <summary>Fills <paramref name="rect"/> with an origin-anchored rectangle of the configured size.</summary>
        protected override bool GetViewRect(CefBrowser browser, ref CefRectangle rect)
        {
            rect.X = 0;
            rect.Y = 0;
            rect.Width = _windowWidth;
            rect.Height = _windowHeight;
            return true;
        }

        /// <summary>Supplies no screen information; always returns false.</summary>
        protected override bool GetScreenInfo(CefBrowser browser, CefScreenInfo screenInfo)
        {
            return false;
        }

        /// <summary>Intentionally empty.</summary>
        protected override void OnPopupSize(CefBrowser browser, CefRectangle rect)
        {
        }

        /// <summary>
        /// Saves the painted buffer as "LastOnPaint.png", overwriting the previous frame.
        /// </summary>
        protected override void OnPaint(CefBrowser browser, CefPaintElementType type, CefRectangle[] dirtyRects, IntPtr buffer, int width, int height)
        {
            // Save the provided buffer (a bitmap image) as a PNG.
            // This runs for every paint, so the Bitmap wrapper must be disposed each
            // time; otherwise its GDI+ handle is only released whenever the finalizer
            // eventually runs. The stride is width * 4 because the format is 32 bpp.
            using (var bitmap = new Bitmap(width, height, width * 4, PixelFormat.Format32bppRgb, buffer))
            {
                bitmap.Save("LastOnPaint.png", ImageFormat.Png);
            }
        }

        /// <summary>Intentionally empty.</summary>
        protected override void OnCursorChange(CefBrowser browser, IntPtr cursorHandle)
        {
        }

        /// <summary>Intentionally empty.</summary>
        protected override void OnScrollOffsetChanged(CefBrowser browser)
        {
        }
    }

    /// <summary>
    /// Adapts the <c>CefStringVisitor</c> callback to a task: the string handed to
    /// <c>Visit</c> becomes the result of <c>Task</c>.
    /// </summary>
    public class TaskStringVisitor : CefStringVisitor
    {
        private readonly TaskCompletionSource<string> taskCompletionSource;

        public TaskStringVisitor()
        {
            taskCompletionSource = new TaskCompletionSource<string>();
        }

        /// <summary>
        /// Receives the visited string and completes <c>Task</c> with it.
        /// </summary>
        /// <param name="value">The string supplied by the caller of the visitor.</param>
        protected override void Visit(string value)
        {
            // Calling SetResult directly would run any awaiting continuations
            // synchronously on the thread that invoked Visit (presumably a CEF
            // thread), which can block that thread for as long as the continuations
            // take - or forever if they wait on CEF. Completing from the thread pool
            // lets Visit return immediately.
            // The name is fully qualified because a bare "Task" inside this class
            // binds to the Task property declared below.
            System.Threading.Tasks.Task.Run(() => taskCompletionSource.SetResult(value));
        }

        /// <summary>Task that completes with the string passed to <c>Visit</c>.</summary>
        public Task<string> Task
        {
            get { return taskCompletionSource.Task; }
        }
    }

    /// <summary>
    /// Task-based convenience extensions for <c>CefBrowser</c>.
    /// </summary>
    public static class CEFExtensions
    {
        /// <summary>
        /// Requests the source of the browser's main frame.
        /// </summary>
        /// <param name="browser">Browser whose main frame is queried.</param>
        /// <returns>The task exposed by the <c>TaskStringVisitor</c> passed to <c>GetSource</c>.</returns>
        public static Task<string> GetSourceAsync(this CefBrowser browser)
        {
            var sourceVisitor = new TaskStringVisitor();
            var mainFrame = browser.GetMainFrame();

            // Hand the visitor to the frame, then expose the visitor's task to the caller.
            mainFrame.GetSource(sourceVisitor);
            return sourceVisitor.Task;
        }
    }
}

The relevant bit of code is here:

// Starts capturing the page source once the main frame has finished loading.
// Deliberately not "async void": an exception thrown from an async void method
// cannot be observed and takes the process down. The asynchronous work lives in
// a Task-returning helper, and any fault is reported on standard error.
protected override void OnLoadEnd(CefBrowser browser, CefFrame frame, int httpStatusCode)
{
    if (!frame.IsMain)
    {
        return;
    }

    CaptureHtmlAsync(browser, httpStatusCode).ContinueWith(
        failed => Console.Error.WriteLine("Failed to capture page source: {0}", failed.Exception),
        TaskContinuationOptions.OnlyOnFaulted);
}

// Fetches the main frame's source, stores it in Html, then logs the end of the load.
private async Task CaptureHtmlAsync(CefBrowser browser, int httpStatusCode)
{
    Html = await browser.GetSourceAsync();
    Console.WriteLine("END: {0}, {1}", browser.GetMainFrame().Url, httpStatusCode);
}

This actually appears to work; you can examine the Html variable with the debugger, and there is an HTML page in there. The problem is, the Html variable does me no good in that callback method; it's buried three layers deep in a class hierarchy, and I need to return it in the method I'm trying to write without creating a Schroedinbug.

(Attempting to get the result from that string Html property, including trying to view it with the Html visualizer in the debugger, appears to cause a deadlock, something I'd really like to avoid, especially since this code is going to run on a server.)

How do I achieve my var result = browser.GetHtml(string url); safely and reliably?

Bonus question: Could the callback mechanisms in the above code be converted to Tasks using this technique? What would that look like?

Upvotes: 0

Views: 1499

Answers (1)

Dmitry Azaraev
Dmitry Azaraev

Reputation: 1103

Keep in mind that current CefGlue versions do not provide any synchronization context, so most of the time you should not use async/await in callbacks unless you are sure of what you are doing.

"Reliable" code should be async, because most CEF calls are async (with or without callbacks provided). Async/await greatly simplifies this task, so I assume that this question can be simplified to: "how do I write the GetSourceAsync method correctly?". This is also related to your bonus question, and the simple answer is of course no: this technique should be considered harmful, because applying it without knowledge of the underlying code leads to differing effects.

So, regarding the GetSourceAsync method, and especially TaskStringVisitor, I only propose that you never call TaskCompletionSource's methods directly, because it executes continuations synchronously (.NET 4.6 has an option to execute continuations asynchronously, but I personally have not inspected how that is done internally in 4.6). This is needed to free the CEF thread as soon as possible. Otherwise you can eventually end up with a big continuation tree, a loop, or a wait, which effectively blocks the browser's thread forever. Also note that extensions of this kind are harmful too, because they have the same problems described above - the only way to deal with them is to have a truly asynchronous continuation.

// Completes the task from a thread-pool thread instead of the calling thread.
protected override void Visit(string value)
{
    Action publishResult = () => taskCompletionSource.SetResult(value);
    System.Threading.Tasks.Task.Run(publishResult);
}

Some CEF APIs are hybrid: they queue a task to the required thread if we are not already on it, or execute synchronously otherwise. For these cases the handling should be simplified, and it is better to avoid async stuff. Again, this is just to avoid synchronous continuations, because they can lead to reentrancy problems and/or leave you with unnecessary stack frames (hopefully only for a short period of time, provided the code does not get stuck somewhere).

One of the simplest examples is shown below; the same also holds for some other API calls:

/// <summary>
/// Helpers for running delegates on a particular CEF thread.
/// </summary>
internal static class CefTaskHelper
{
    /// <summary>
    /// Runs <paramref name="action"/> on the given CEF thread. If the caller is
    /// already on that thread the action runs inline; otherwise it is posted and
    /// the returned task reflects its outcome.
    /// </summary>
    /// <param name="threadId">Target CEF thread.</param>
    /// <param name="action">Work to execute.</param>
    /// <returns>A task representing the execution of <paramref name="action"/>.</returns>
    public static Task RunAsync(CefThreadId threadId, Action action)
    {
        // Fast path: already on the requested thread, so execute synchronously.
        if (CefRuntime.CurrentlyOn(threadId))
        {
            action();
            return TaskHelpers.Completed();
        }

        // Slow path: post to the target thread and signal completion through a task.
        var completion = new TaskCompletionSource<FakeVoid>();
        Action guarded = () =>
        {
            try
            {
                action();
                completion.SetResultAsync(default(FakeVoid));
            }
            catch (Exception error)
            {
                completion.SetExceptionAsync(error);
            }
        };
        StartNew(threadId, guarded);
        return completion.Task;
    }

    /// <summary>
    /// Posts <paramref name="action"/> to the given CEF thread without tracking its completion.
    /// </summary>
    public static void StartNew(CefThreadId threadId, Action action)
    {
        var posted = new CefActionTask(action);
        CefRuntime.PostTask(threadId, posted);
    }
}

UPDATE:

This actually appears to work; you can examine the Html variable with the debugger, and there is an HTML page in there. The problem is, the Html variable does me no good in that callback method; it's buried three layers deep in a class hierarchy, and I need to return it in the method I'm trying to write without creating a Schroedinbug.

You just need to implement CefLifeSpanHandler, and then you get direct access to the CefBrowser once it has been created (it is created asynchronously). There is also a CreateBrowserSync call, but it is not the preferred way.

PS: I'm working on the next generation of CefGlue, but right now nothing is ready to use. Better async/await integration is planned. I personally use async/await around it intensively, precisely in a server-side environment.

Upvotes: 1

Related Questions