Trích xuất văn bản từ tài liệu bằng API Tài liệu

Đôi khi bạn chỉ cần trích xuất phần văn bản của một tài liệu, chẳng hạn như khi bạn muốn truyền văn bản đó đến một dịch vụ API khác. Tất cả văn bản trong một tài liệu đều nằm trong các thẻ (tab) của tài liệu đó, ở dạng các đoạn chạy văn bản (text run) của các phần tử đoạn văn. Việc trích xuất tất cả văn bản trong một tài liệu bao gồm việc duyệt qua hệ phân cấp cây thẻ và gọi các phương thức getter của Tab và DocumentTab. Hãy xem phần Làm việc với thẻ để biết thêm thông tin về tính năng thẻ.

Văn bản có thể xuất hiện trong 3 loại thành phần cấu trúc của thẻ tài liệu:

  • Đoạn
  • Mục lục
  • Bảng

Bạn có thể lồng các bảng vào trong một bảng khác. Do đó, để trích xuất tất cả văn bản trong một tài liệu, bạn phải truy cập vào từng phần tử cấu trúc lồng nhau.

Để biết nội dung mô tả đầy đủ về phần nội dung của tài liệu, hãy xem hướng dẫn về Cấu trúc tài liệu.

Mẫu API Google Tài liệu sau đây sử dụng tính năng đệ quy để truy cập vào từng phần tử cấu trúc trong tất cả các thẻ của một tài liệu và in văn bản.

Java

// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.extensions.java6.auth.oauth2.AuthorizationCodeInstalledApp;
import com.google.api.client.extensions.jetty.auth.oauth2.LocalServerReceiver;
import com.google.api.client.googleapis.auth.oauth2.GoogleAuthorizationCodeFlow;
import com.google.api.client.googleapis.auth.oauth2.GoogleClientSecrets;
import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.client.util.store.FileDataStoreFactory;
import com.google.api.services.docs.v1.Docs;
import com.google.api.services.docs.v1.DocsScopes;
import com.google.api.services.docs.v1.model.Document;
import com.google.api.services.docs.v1.model.DocumentTab;
import com.google.api.services.docs.v1.model.ParagraphElement;
import com.google.api.services.docs.v1.model.StructuralElement;
import com.google.api.services.docs.v1.model.Tab;
import com.google.api.services.docs.v1.model.TableCell;
import com.google.api.services.docs.v1.model.TableRow;
import com.google.api.services.docs.v1.model.TextRun;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Command-line sample that prints the text of every tab in a Google Doc.
 *
 * <p>The document is fetched with tab content included, the tab tree is flattened in top-down
 * order, and each tab's body is walked recursively so that text inside paragraphs, tables
 * (including nested tables) and tables of contents is collected.
 */
public class ExtractText {
  private static final String APPLICATION_NAME = "Google Docs API Extract Guide";
  private static final JsonFactory JSON_FACTORY = JacksonFactory.getDefaultInstance();
  private static final String TOKENS_DIRECTORY_PATH = "tokens";
  private static final String DOCUMENT_ID = "YOUR_DOCUMENT_ID";

  /**
   * Global instance of the scopes required by this quickstart. If modifying these scopes, delete
   * your previously saved tokens/ folder.
   */
  private static final List<String> SCOPES =
      Collections.singletonList(DocsScopes.DOCUMENTS_READONLY);

  /** Classpath location of the OAuth client secrets file. */
  private static final String CREDENTIALS_FILE_PATH = "/credentials.json";

  /**
   * Creates an authorized Credential object.
   *
   * @param HTTP_TRANSPORT The network HTTP Transport.
   * @return An authorized Credential object.
   * @throws IOException If the credentials.json file cannot be found or read.
   */
  private static Credential getCredentials(final NetHttpTransport HTTP_TRANSPORT)
      throws IOException {
    // Load client secrets. getResourceAsStream returns null (rather than throwing) when the
    // resource is missing, so turn that into the IOException this method documents.
    GoogleClientSecrets clientSecrets;
    try (InputStream in = ExtractText.class.getResourceAsStream(CREDENTIALS_FILE_PATH)) {
      if (in == null) {
        throw new java.io.FileNotFoundException("Resource not found: " + CREDENTIALS_FILE_PATH);
      }
      clientSecrets =
          GoogleClientSecrets.load(
              JSON_FACTORY,
              new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8));
    }

    // Build flow and trigger user authorization request.
    GoogleAuthorizationCodeFlow flow =
        new GoogleAuthorizationCodeFlow.Builder(HTTP_TRANSPORT, JSON_FACTORY, clientSecrets, SCOPES)
            .setDataStoreFactory(new FileDataStoreFactory(new java.io.File(TOKENS_DIRECTORY_PATH)))
            .setAccessType("offline")
            .build();
    LocalServerReceiver receiver = new LocalServerReceiver.Builder().setPort(8888).build();
    return new AuthorizationCodeInstalledApp(flow, receiver).authorize("user");
  }

  /**
   * Returns the given list, or an empty list when it is null.
   *
   * <p>Used to iterate over list-valued model getters without a null check at each call site.
   */
  private static <T> List<T> nullToEmpty(List<T> list) {
    return list == null ? Collections.<T>emptyList() : list;
  }

  /**
   * Adds the provided tab to the list of all tabs, and recurses through and
   * adds all child tabs.
   *
   * @param tab the tab to add, followed by its descendants
   * @param allTabs the accumulator list, which is modified in place
   */
  private static void addCurrentAndChildTabs(Tab tab, List<Tab> allTabs) {
    allTabs.add(tab);
    // The loop variable must not reuse the parameter name, and the child list is
    // null-guarded because a tab without children may not carry one.
    for (Tab childTab : nullToEmpty(tab.getChildTabs())) {
      addCurrentAndChildTabs(childTab, allTabs);
    }
  }

  /**
   * Returns a flat list of all tabs in the document in the order they would
   * appear in the UI (top-down ordering). Includes all child tabs.
   *
   * @param doc the document whose tabs are flattened
   * @return every tab in depth-first order; empty if the document carries no tabs
   */
  private static List<Tab> getAllTabs(Document doc) {
    List<Tab> allTabs = new ArrayList<>();
    // Iterate over all tabs and recursively add any child tabs to generate a
    // flat list of Tabs.
    for (Tab tab : nullToEmpty(doc.getTabs())) {
      addCurrentAndChildTabs(tab, allTabs);
    }
    return allTabs;
  }

  /**
   * Returns the text in the given ParagraphElement.
   *
   * @param element a ParagraphElement from a Google Doc
   * @return the element's text run content, or an empty string if it has none
   */
  private static String readParagraphElement(ParagraphElement element) {
    TextRun run = element.getTextRun();
    if (run == null || run.getContent() == null) {
      // The TextRun can be null if there is an inline object.
      return "";
    }
    return run.getContent();
  }

  /**
   * Recurses through a list of Structural Elements to read a document's text where text may be in
   * nested elements.
   *
   * @param elements a list of Structural Elements; null is treated as empty
   * @return the concatenated text of all paragraphs, table cells and table-of-contents entries
   */
  private static String readStructuralElements(List<StructuralElement> elements) {
    StringBuilder sb = new StringBuilder();
    for (StructuralElement element : nullToEmpty(elements)) {
      if (element.getParagraph() != null) {
        for (ParagraphElement paragraphElement :
            nullToEmpty(element.getParagraph().getElements())) {
          sb.append(readParagraphElement(paragraphElement));
        }
      } else if (element.getTable() != null) {
        // The text in table cells are in nested Structural Elements and tables may be
        // nested.
        for (TableRow row : nullToEmpty(element.getTable().getTableRows())) {
          for (TableCell cell : nullToEmpty(row.getTableCells())) {
            sb.append(readStructuralElements(cell.getContent()));
          }
        }
      } else if (element.getTableOfContents() != null) {
        // The text in the TOC is also in a Structural Element.
        sb.append(readStructuralElements(element.getTableOfContents().getContent()));
      }
    }
    return sb.toString();
  }

  public static void main(String... args) throws IOException, GeneralSecurityException {
    // Build a new authorized API client service.
    final NetHttpTransport HTTP_TRANSPORT = GoogleNetHttpTransport.newTrustedTransport();
    Docs service =
        new Docs.Builder(HTTP_TRANSPORT, JSON_FACTORY, getCredentials(HTTP_TRANSPORT))
            .setApplicationName(APPLICATION_NAME)
            .build();

    // Fetch the document with all of the tabs populated, including any nested
    // child tabs.
    Document doc =
        service.documents().get(DOCUMENT_ID).setIncludeTabsContent(true).execute();
    List<Tab> allTabs = getAllTabs(doc);

    // Print the text from each tab in the document.
    for (Tab tab : allTabs) {
      // Get the DocumentTab from the generic Tab.
      DocumentTab documentTab = tab.getDocumentTab();
      if (documentTab == null || documentTab.getBody() == null) {
        // Nothing to read for a tab that carries no document body.
        continue;
      }
      System.out.println(
          readStructuralElements(documentTab.getBody().getContent()));
    }
  }
}

Python

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Recursively extracts the text from a Google Doc.
"""
import googleapiclient.discovery as discovery
from httplib2 import Http
from oauth2client import client
from oauth2client import file
from oauth2client import tools

# OAuth scope requested when the authorization flow is run in get_credentials().
SCOPES = 'https://www.googleapis.com/auth/documents.readonly'
# Discovery document URL passed to discovery.build() in main().
DISCOVERY_DOC = 'https://docs.googleapis.com/$discovery/rest?version=v1'
# Placeholder: replace with the ID of the document to read before running.
DOCUMENT_ID = 'YOUR_DOCUMENT_ID'


def get_credentials():
  """Loads the user's credentials, running the OAuth 2.0 flow if required.

  Credentials are looked up in the 'token.json' store first. When none are
  stored, or the stored ones are marked invalid, the flow built from
  'credentials.json' is run and its result is returned instead.

  Returns:
      Credentials, the obtained credential.
  """
  token_store = file.Storage('token.json')
  stored = token_store.get()

  # Fast path: usable credentials are already on disk.
  if stored and not stored.invalid:
    return stored

  oauth_flow = client.flow_from_clientsecrets('credentials.json', SCOPES)
  return tools.run_flow(oauth_flow, token_store)


def add_current_and_child_tabs(tab, all_tabs):
  """Adds the provided tab to the list of all tabs, and recurses through and
  adds all child tabs.

  Args:
      tab: a Tab from a Google Doc.
      all_tabs: a list of all tabs in the document; it is extended in place.
  """
  all_tabs.append(tab)
  # A tab without children may have no 'childTabs' key at all, so fall back to
  # an empty list instead of iterating over None.
  for child_tab in tab.get('childTabs') or []:
    add_current_and_child_tabs(child_tab, all_tabs)


def get_all_tabs(doc):
  """Returns a flat list of all tabs in the document in the order they would
  appear in the UI (top-down ordering). Includes all child tabs.

  Args:
      doc: a document.

  Returns:
      A list of tabs; empty when the document has no 'tabs' entry.
  """
  all_tabs = []
  # Iterate over all tabs and recursively add any child tabs to generate a
  # flat list of Tabs. A missing 'tabs' key is treated as "no tabs" rather
  # than failing on iteration over None.
  for tab in doc.get('tabs') or []:
    add_current_and_child_tabs(tab, all_tabs)
  return all_tabs


def read_paragraph_element(element):
  """Returns the text in the given ParagraphElement.

  Args:
      element: a ParagraphElement from a Google Doc.

  Returns:
      The content of the element's text run, or '' when the element has no
      text run or the text run has no content.
  """
  text_run = element.get('textRun')
  if not text_run:
    # The text run is absent for non-text elements such as inline objects.
    return ''
  # Always return a string so callers can concatenate the result safely.
  return text_run.get('content') or ''


def read_structural_elements(elements):
  """Recurses through a list of Structural Elements to read a document's text
  where text may be in nested elements.

  Args:
      elements: a list of Structural Elements; None is treated as empty.

  Returns:
      The concatenated text of every paragraph, table cell and table of
      contents found in the list, in document order.
  """
  # Collect the pieces and join once at the end instead of growing a string.
  parts = []
  for value in elements or []:
    if 'paragraph' in value:
      # Use a separate name so the 'elements' argument is not rebound while
      # it is being iterated.
      paragraph_elements = value.get('paragraph').get('elements') or []
      for elem in paragraph_elements:
        # 'or' keeps the join safe even if the helper yields no text.
        parts.append(read_paragraph_element(elem) or '')
    elif 'table' in value:
      # The text in table cells are in nested Structural Elements and tables may
      # be nested.
      table = value.get('table')
      for row in table.get('tableRows') or []:
        for cell in row.get('tableCells') or []:
          parts.append(read_structural_elements(cell.get('content')))
    elif 'tableOfContents' in value:
      # The text in the TOC is also in a Structural Element.
      toc = value.get('tableOfContents')
      parts.append(read_structural_elements(toc.get('content')))
  return ''.join(parts)


def main():
  """Uses the Docs API to print out the text of a document.

  Authorizes with the stored or newly obtained credentials, fetches
  DOCUMENT_ID with the content of all tabs included, and prints the text of
  each tab in top-down order.
  """
  credentials = get_credentials()
  http = credentials.authorize(Http())
  docs_service = discovery.build(
      'docs', 'v1', http=http, discoveryServiceUrl=DISCOVERY_DOC
  )
  # Fetch the document with all of the tabs populated, including any nested
  # child tabs. Discovery-built methods take the API's own parameter names,
  # so the flag is spelled includeTabsContent; the snake_case spelling is
  # rejected as an unexpected keyword argument.
  doc = (
      docs_service.documents()
      .get(documentId=DOCUMENT_ID, includeTabsContent=True)
      .execute()
  )
  all_tabs = get_all_tabs(doc)

  # Print the text from each tab in the document.
  for tab in all_tabs:
    # Get the DocumentTab from the generic Tab.
    document_tab = tab.get('documentTab') or {}
    body = document_tab.get('body')
    if body is None:
      # Nothing to read for a tab that carries no document body.
      continue
    print(read_structural_elements(body.get('content') or []))


# Run the sample only when this file is executed as a script, not on import.
if __name__ == '__main__':
  main()