import numpy as np

# Toy corpus: one entry per "sample" (each sample here is a sentence).
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Step 1: build an index of every token that appears in the data.
# Tokens come from a plain whitespace `split`; a real pipeline would also
# strip punctuation and special characters from the samples first.
token_index = {}
for sample in samples:
    for token in sample.split():
        # `setdefault` stores an id only the first time a token is seen.
        # Ids start at 1, so index 0 is never assigned to any word.
        token_index.setdefault(token, len(token_index) + 1)

# Step 2: vectorize the samples.
# Only the first `max_length` words of each sample are considered.
max_length = 10

# Result tensor, indexed as [sample, word position, token id].
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, token in enumerate(sample.split()[:max_length]):
        results[i, j, token_index[token]] = 1
# Character-level one-hot encoding
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable  # all printable ASCII characters
# Map each character to an integer id starting at 1; index 0 is never assigned.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

# Only the first `max_length` characters of each sample are encoded.
max_length = 50
# Result tensor, indexed as [sample, character position, character id].
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        if index is None:
            # Character outside `string.printable`. Indexing with None would
            # act as np.newaxis and set the whole row to 1, so leave the row
            # all-zero instead.
            continue
        results[i, j, index] = 1
# Keras has built-in utilities for one-hot encoding text at the word level or
# the character level, starting from raw text data.
# These are what should be used in practice, because they take care of a
# number of important features, such as stripping special characters from
# strings, or only taking into account the N most common words in the dataset
# (a common restriction, to avoid dealing with very large input vector spaces).

# Word-level one-hot encoding with Keras
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create a tokenizer configured to only take into account the 1,000 most
# common words.
tokenizer = Tokenizer(num_words=1000)

# Build the word index. This must run before any `texts_to_*` call; otherwise
# the tokenizer has no vocabulary and every sample encodes to zeros.
tokenizer.fit_on_texts(samples)

# The one-hot binary representation can be obtained directly.
# Note that vectorization modes other than one-hot encoding are supported too.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

# How to recover the word index that was computed.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

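# A variant of one-hot encoding is the so-called "one-hot hashing trick", which
# can be used when the number of unique tokens in the vocabulary is too large
# to handle explicitly. Instead of explicitly assigning an index to each word
# and keeping a reference to these indices in a dictionary, words are hashed
# into vectors of a fixed size.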
# One-hot encoding with the hashing trick
import zlib

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# Words are stored as vectors of size `dimensionality`.
# With close to 1000 distinct words (or more), many hash collisions start to
# appear, which degrades the accuracy of this encoding method.
dimensionality = 1000
# Only the first `max_length` words of each sample are encoded.
max_length = 10

# Result tensor, indexed as [sample, word position, hash bucket].
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        # Hash the word to an integer bucket in [0, dimensionality).
        # The built-in hash() is salted per process for str (PYTHONHASHSEED),
        # so it would send the same word to a different bucket on every run.
        # CRC32 of the UTF-8 bytes is stable across runs and machines.
        index = zlib.crc32(word.encode('utf-8')) % dimensionality
        results[i, j, index] = 1.



[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
D:\Users\Seavan_CC\Anaconda3\lib\site-packages\h5py\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
Found 9 unique tokens.
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]