1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
|
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#import <Vision/Vision.h>
#include "mozilla/dom/Promise.h"
#include "mozilla/gfx/2D.h"
#include "mozilla/ErrorResult.h"
#include "ErrorList.h"
#include "nsClipboard.h"
#include "nsCocoaUtils.h"
#include "mozilla/MacStringHelpers.h"
#include "mozilla/ScopeExit.h"
#include "mozilla/widget/TextRecognition.h"
#include "mozilla/dom/PContent.h"
namespace mozilla::widget {
// Performs text recognition ("find text in image") on aSurface using the
// macOS Vision framework. The expensive recognition work is dispatched to a
// background thread; the returned promise is resolved with the recognized
// text quads on success, or rejected with an error string on failure.
//
// @param aSurface    The image data to scan for text.
// @param aLanguages  Language tags, in priority order, used to bias
//                    recognition (passed through to Vision's
//                    recognitionLanguages).
// @return A NativePromise resolved with dom::TextRecognitionResult, or
//         rejected with an nsCString error description.
auto TextRecognition::DoFindText(gfx::DataSourceSurface& aSurface,
                                 const nsTArray<nsCString>& aLanguages)
    -> RefPtr<NativePromise> {
  NS_OBJC_BEGIN_TRY_IGNORE_BLOCK
  // TODO - Is this the most efficient path? Maybe we can write a new
  // CreateCGImageFromXXX that enables more efficient marshalling of the data.
  CGImageRef imageRef = NULL;
  nsresult rv = nsCocoaUtils::CreateCGImageFromSurface(&aSurface, &imageRef);
  if (NS_FAILED(rv) || !imageRef) {
    return NativePromise::CreateAndReject("Failed to create CGImage"_ns,
                                          __func__);
  }
  auto promise = MakeRefPtr<NativePromise::Private>(__func__);
  // NOTE: this code is not built with ARC (see the explicit release in the
  // scope exit below), so every alloc'd Objective-C object here must be
  // released manually.
  NSMutableArray* recognitionLanguages = [[NSMutableArray alloc] init];
  for (const auto& locale : aLanguages) {
    [recognitionLanguages addObject:nsCocoaUtils::ToNSString(locale)];
  }
  NS_DispatchBackgroundTask(
      NS_NewRunnableFunction(
          __func__,
          [promise, imageRef, recognitionLanguages] {
            // Balance the references handed to this task regardless of how
            // the recognition attempt ends.
            auto unrefImage = MakeScopeExit([&] {
              ::CGImageRelease(imageRef);
              [recognitionLanguages release];
            });
            dom::TextRecognitionResult result;
            dom::TextRecognitionResult* pResult = &result;
            // Define the request to use, which also handles the result. It
            // will be run below directly in this thread —
            // performRequests:error: is synchronous — so it is safe for the
            // handler to write into the stack-allocated `result` through
            // `pResult`.
            VNRecognizeTextRequest* textRecognitionRequest =
                [[VNRecognizeTextRequest alloc] initWithCompletionHandler:^(
                                                    VNRequest* _Nonnull request,
                                                    NSError* _Nullable error) {
                  NSArray<VNRecognizedTextObservation*>* observations =
                      request.results;
                  [observations enumerateObjectsUsingBlock:^(
                                    VNRecognizedTextObservation* _Nonnull obj,
                                    NSUInteger idx, BOOL* _Nonnull stop) {
                    // Requests the n top candidates for a recognized text
                    // string; we only keep the single best candidate.
                    VNRecognizedText* recognizedText =
                        [obj topCandidates:1].firstObject;
                    // https://developer.apple.com/documentation/vision/vnrecognizedtext?language=objc
                    auto& quad = *pResult->quads().AppendElement();
                    CopyNSStringToXPCOMString(recognizedText.string,
                                              quad.string());
                    quad.confidence() = recognizedText.confidence;
                    auto ToImagePoint = [](CGPoint aPoint) -> ImagePoint {
                      return {static_cast<float>(aPoint.x),
                              static_cast<float>(aPoint.y)};
                    };
                    // Store the observation's four corners in bottom-left,
                    // top-left, top-right, bottom-right order.
                    *quad.points().AppendElement() =
                        ToImagePoint(obj.bottomLeft);
                    *quad.points().AppendElement() = ToImagePoint(obj.topLeft);
                    *quad.points().AppendElement() = ToImagePoint(obj.topRight);
                    *quad.points().AppendElement() =
                        ToImagePoint(obj.bottomRight);
                  }];
                }];
            textRecognitionRequest.recognitionLevel =
                VNRequestTextRecognitionLevelAccurate;
            textRecognitionRequest.recognitionLanguages = recognitionLanguages;
            textRecognitionRequest.usesLanguageCorrection = true;
            // Send out the request. This blocks execution of this thread with
            // an expensive CPU call.
            NSError* error = nil;
            VNImageRequestHandler* requestHandler =
                [[[VNImageRequestHandler alloc] initWithCGImage:imageRef
                                                        options:@{}]
                    autorelease];
            [requestHandler performRequests:@[ textRecognitionRequest ]
                                      error:&error];
            // performRequests:error: has returned, so the request (and its
            // completion handler) is no longer needed. Without this release
            // the request object leaked on every call, since this file is
            // built without ARC.
            [textRecognitionRequest release];
            if (error != nil) {
              promise->Reject(
                  nsPrintfCString(
                      "Failed to perform text recognition request (%ld)\n",
                      error.code),
                  __func__);
            } else {
              promise->Resolve(std::move(result), __func__);
            }
          }),
      NS_DISPATCH_EVENT_MAY_BLOCK);
  return promise;
  NS_OBJC_END_TRY_IGNORE_BLOCK
}
} // namespace mozilla::widget
|