OCR Images Using Microsoft Office 2003 SDK

Universal Document Converter is virtual printer software that saves any document you print as a raster PDF or an image file. You can use a post-print feature of Universal Document Converter to apply additional processing to every output file. The example below is just one of many post-print processing solutions.

// The Microsoft Office Document Imaging Library (MODI), which is
// supplied with the Office 2003 package, allows you to easily integrate
// OCR functionality into your own applications. For example, you can
// use recognized text for indexing documents in your database.
// Important notice: MS Office 2000 and XP do not contain MODI!
//
// 1) Open your project in Microsoft Visual C++ 6.0
//
// 2) Press "Ctrl+W" in Visual C++ to open ClassWizard
//
// 3) In ClassWizard window press "Add Class->From a type library"
//    button and select "MDIVWCTL.DLL" file. By default this file
//    should be in folder:
//    "C:\Program Files\Common Files\Microsoft Shared\MODI\11.0"
//
// 4) Save the generated files "mdivwctl.h" and "mdivwctl.cpp" and add
//    this include to your "stdafx.h" file: #include "mdivwctl.h"
//
// 5) You must initialize COM before you call any COM method.
//    Call "::CoInitialize(0);" before using COM and
//    "::CoUninitialize();" after you have finished using COM.

#include "mdivwctl.h"

enum MiLANGUAGES 
{ 
  miLANG_CHINESE_SIMPLIFIED = 2052,
  miLANG_CHINESE_TRADITIONAL = 1028,
  miLANG_CZECH = 5,
  miLANG_DANISH = 6, 
  miLANG_DUTCH = 19,
  miLANG_ENGLISH = 9,
  miLANG_FINNISH = 11,
  miLANG_FRENCH = 12,
  miLANG_GERMAN = 7,
  miLANG_GREEK = 8, 
  miLANG_HUNGARIAN = 14,
  miLANG_ITALIAN = 16,
  miLANG_JAPANESE = 17,
  miLANG_KOREAN = 18,
  miLANG_NORWEGIAN = 20,
  miLANG_POLISH = 21,
  miLANG_PORTUGUESE = 22,
  miLANG_RUSSIAN = 25,
  miLANG_SPANISH = 10,
  miLANG_SWEDISH = 29,
  miLANG_SYSDEFAULT = 2048,
  miLANG_TURKISH = 31
};

enum MiFILE_FORMAT
{
  miFILE_FORMAT_DEFAULTVALUE = -1,
  miFILE_FORMAT_TIFF = 1,
  miFILE_FORMAT_TIFF_LOSSLESS = 2, 
  miFILE_FORMAT_MDI = 4
};

enum MiCOMP_LEVEL 
{
  miCOMP_LEVEL_LOW = 0,
  miCOMP_LEVEL_MEDIUM = 1, 
  miCOMP_LEVEL_HIGH = 2
};

BOOL OCRImageFile( CString sImgFilePath, CString sOutFilePath )
{
  IDocument *pDoc = new IDocument;
  pDoc->CreateDispatch( "MODI.Document" );

  pDoc->Create( sImgFilePath );
  pDoc->OCR( miLANG_ENGLISH, 0, 0 );
////
  FILE    *fpOut = fopen( sOutFilePath, "wt" );
  IImages images = pDoc->GetImages();
  long	  num = images.GetCount();

  for( int i = 0; i < num; i++ )
  {
    IImage  image = images.GetItem(i);
    ILayout layout = image.GetLayout();

    fprintf( fpOut, " --< page %d of %d begin >--\n", i + 1, num );
    fprintf( fpOut, layout.GetText() );
    fprintf( fpOut, " --< page %d of %d end >--\n", i + 1, num );
  }
////
  pDoc->Close(0);

  pDoc->ReleaseDispatch();
  delete pDoc;

  fclose( fpOut );
////
  return (num > 0) ? TRUE : FALSE;
}


  • Marc Crames

    Marc Crames

    Student, Germany

    «The tool completely meets all of my expectations. It even allows me to precisely adjust the resolution of the created image file. In addition, as a pleasant side effect, the file is relatively immune to alteration.»