我想在 Jupyter Notebook 上使用 TensorFlow 2.8 处理文本。
我的代码:
import re
import string
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_text as tf_text
def standardize(input_data):
    """Lowercase the text, strip punctuation, and tokenize on whitespace.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        The tokens produced by ``tf_text.WhitespaceTokenizer`` for the
        lowercased, punctuation-free text.
    """
    lowercase_str = tf.strings.lower(input_data)
    # Delete every character listed in string.punctuation; re.escape keeps
    # them literal inside the regex character class.
    a_str = tf.strings.regex_replace(lowercase_str, f"[{re.escape(string.punctuation)}]", "")
    tokenizer = tf_text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(a_str)
    return tokens
# the input data loaded from text files by TfRecordDataset(file_paths, "GZIP")
# each file can be 200+MB, totally about 300 files
# each file hold the data with multiple columns
# some columns are text
# after loading, the dataset will be accessed by column name
# e.g. one column is "sports", so the input_dataset["sports"]
# return a tensor, which is like the following example
my_data_tensor = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]])
tf.print(my_data_tensor)
tf.print(my_data_tensor.shape)
tf.print(f"type is {type(my_data_tensor)}")
text_layer = layers.TextVectorization(
standardize = standardize,
max_tokens = 10,
output_mode = 'int',
output_sequence_length=10
)
my_dataset = tf.data.Dataset.from_tensor_slices(my_data_tensor)
text_layer.adapt(my_dataset.batch(2)) # error
processed_text = text_layer(my_dataset)
error:
ValueError: Exception encountered when calling layer "query_tower" (type QueryTower).
When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(2, 1, None) with rank=3
我尝试过 tf.unstack() 和 tf.reshape、tf.unbatch,但都不起作用。 对于给定的示例:
[["SWIM 2008-07 Baseball"], ["Football"]]
我需要什么:
[["swim 200807 baseball"], ["football"]]
then
it will be encoded as int by the "text_layer"
这些数据 (batch_size=2) 将作为特征用于机器学习模型。
我做错了什么吗?谢谢
最佳答案
你可以尝试这样的事情:
import re
import string
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_text as tf_text
def standardize(input_data):
    """Normalize raw text and split it into whitespace-separated tokens.

    The input is lowercased, all characters in ``string.punctuation`` are
    removed, and the result is tokenized with
    ``tf_text.WhitespaceTokenizer``.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        The tokens returned by the whitespace tokenizer.
    """
    lowercase_str = tf.strings.lower(input_data)
    # re.escape makes the punctuation characters literal inside the regex
    # character class.
    a_str = tf.strings.regex_replace(lowercase_str, f"[{re.escape(string.punctuation)}]", "")
    tokenizer = tf_text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(a_str)
    return tokens
# the input data loaded from text files by TfRecordDataset(file_paths, "GZIP")
# each file can be 200+MB, totally about 300 files
# each file hold the data with multiple columns
# some columns are text
# after loading, the dataset will be accessed by column name
# e.g. one column is "sports", so the input_dataset["sports"]
# return a tensor, which is like the following example
my_data_tensor = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]])
tf.print(my_data_tensor)
tf.print(my_data_tensor.shape)
tf.print(f"type is {type(my_data_tensor)}")
text_layer = layers.TextVectorization(
standardize = standardize,
max_tokens = 10,
output_mode = 'int',
output_sequence_length=10
)
my_dataset = tf.data.Dataset.from_tensor_slices(my_data_tensor)
# Concatenate the dataset's elements along axis 0 and slice the result again;
# for the (2, 1) example tensor this should drop the inner size-1 dimension.
my_dataset = tf.data.Dataset.from_tensor_slices((tf.concat(list(my_dataset.map(lambda x: x)), axis=0)))
text_layer.adapt(my_dataset)
my_dataset = my_dataset.batch(2)
# NOTE(review): after the concat above, the batched elements look rank-1, so
# this squeeze may find no size-1 last axis to remove - confirm by running.
processed_text = my_dataset.map(lambda x: text_layer(tf.squeeze(x, axis=-1)))
# Iterate over `processed_text`, the mapped dataset defined on the line above.
for p in processed_text:
    print(p)
[["SWIM 2008-07 Baseball"]
["Football"]]
TensorShape([2, 1])
type is <class 'tensorflow.python.framework.ops.EagerTensor'>
(<tf.Tensor: shape=(2, 10), dtype=int64, numpy=
array([[2, 5, 6, 4, 0, 0, 0, 0, 0, 0],
[3, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>, <tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 0], dtype=int32)>)
关于tensorflow 2 TextVectorization过程张量和数据集错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/71714299/