python - Strange results from an entropy calculation

Tags: python, algorithm

I am trying to write a function that correctly calculates the entropy of a given dataset. However, I am getting very strange entropy values.

My understanding was that every entropy calculation must fall between 0 and 1, yet I consistently get values above 2.

Note: I must use log base 2 for this.

Can someone explain why I am producing these incorrect entropy results? The dataset I am testing with is the ecoli dataset from the UCI Machine Learning Repository.

import numpy
import math


#################### DATA HANDLING LIBRARY ####################
def csv_to_array(file):
    # Open the file, and load it in delimiting on the ',' for a comma separated value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')

    # Loop through the data in the array
    for index in range(len(data)):
        # Try to convert each entry to float; if conversion fails, fall back to 0
        try:
            data[index] = [float(x) for x in data[index]]
        except ValueError:
            data[index] = 0

    # Return the now type-formatted data
    return data
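

# A minimal alternative sketch: unlike loadtxt, numpy.genfromtxt converts
# fields it cannot parse into nan instead of raising, so the try/except
# loop above collapses to two lines; nan_to_num then maps nan entries to 0.0.
def csv_to_array_alt(file):
    data = numpy.genfromtxt(file, delimiter=',')
    return numpy.nan_to_num(data)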


# Function that utilizes the numpy library to randomize the dataset.
def randomize_data(csv):
    # numpy.random.shuffle shuffles in place and returns None, so shuffle
    # first and then return the array itself
    numpy.random.shuffle(csv)
    return csv


# Function to split the data into test, training set, and validation sets
def split_data(csv):
    # Call the randomize data function
    randomize_data(csv)
    # Grab the number of rows and calculate where to split
    num_rows = csv.shape[0]
    validation_split = int(num_rows * 0.10)
    training_split = int(num_rows * 0.72)
    testing_split = int(num_rows * 0.18)

    # Validation set as the first 10% of the data
    validation_set = csv[:validation_split]
    # Training set as the next 72% of the data
    training_set = csv[validation_split:training_split + validation_split]
    # Testing set as the last 18% of the data
    testing_set = csv[training_split + validation_split:]
    # Split the data into classes vs actual data
    training_cols = training_set.shape[1]
    testing_cols = testing_set.shape[1]
    validation_cols = validation_set.shape[1]
    training_classes = training_set[:, training_cols - 1]
    testing_classes = testing_set[:, testing_cols - 1]
    validation_classes = validation_set[:, validation_cols - 1]

    # Take the sets and remove the last (classification) column
    training_set = training_set[:, :-1]
    testing_set = testing_set[:, :-1]
    validation_set = validation_set[:, :-1]

    # Return the datasets
    return testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes
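

# A quick shape-check sketch for split_data (the helper name is made up for
# illustration): the three row counts should add back up to the original
# number of rows, and each set should have one fewer column than the input.
def check_split(csv):
    test, test_c, train, train_c, val, val_c = split_data(csv)
    assert test.shape[0] + train.shape[0] + val.shape[0] == csv.shape[0]
    print(test.shape, train.shape, val.shape)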


#################### DATA HANDLING LIBRARY ####################

# This function returns the list of classes, and their associated weights (i.e. distributions)
# for a given dataset
def class_distribution(dataset):
    # Ensure the dataset is a numpy array
    dataset = numpy.asarray(dataset)
    # Collect # of total rows and columns, using numpy
    num_total_rows = dataset.shape[0]
    num_columns = dataset.shape[1]
    # Create a numpy array of just the classes
    classes = dataset[:, num_columns - 1]
    # Use numpy.unique to remove duplicates
    classes = numpy.unique(classes)
    # Create an empty array for the class weights
    class_weights = []

    # Loop through the classes one by one
    for aclass in classes:
        # Create storage variables
        total = 0
        weight = 0
        # Now loop through the dataset
        for row in dataset:
            # If the class of the dataset is equal to the current class you are evaluating, increase the total
            if numpy.array_equal(aclass, row[-1]):
                total = total + 1
            # If not, continue
            else:
                continue
        # Divide the # of occurrences by total rows
        weight = float((total / num_total_rows))
        # Add that weight to the list of class weights
        class_weights.append(weight)

    # Turn the weights into a numpy array
    class_weights = numpy.asarray(class_weights)
    # Return the array
    return classes, class_weights
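
# An equivalent sketch of class_distribution: numpy.unique with
# return_counts=True collects the classes and their counts in one pass,
# so each weight is simply a count divided by the number of rows.
def class_distribution_alt(dataset):
    dataset = numpy.asarray(dataset)
    classes, counts = numpy.unique(dataset[:, -1], return_counts=True)
    return classes, counts / dataset.shape[0]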

# This function returns the entropy for a given dataset
# Can be used across an entire csv, or just for a column of data (feature)
def get_entropy(dataset):
    # Set initial entropy
    entropy = 0.0
    # Determine the classes and their frequencies (weights) of the dataset
    classes, class_freq = class_distribution(dataset)
    # numpy.sort returns a new array rather than sorting in place; reverse
    # the result so the most frequent class is evaluated first
    class_freq = numpy.sort(class_freq)[::-1]
    # Determine the max entropy for the dataset
    max_entropy = math.log(len(classes), 2)
    print("MAX ENTROPY FOR THIS DATASET: ", max_entropy)
    # Loop through the frequencies and accumulate -p * log2(p) for each class
    for freq in class_freq:
        entropy += float(-freq * math.log(freq, 2))

    # Return the entropy value
    return entropy
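

# A quick sanity check with made-up data: a 50/50 two-class split should
# give exactly 1 bit, while a uniform four-class split gives log2(4) = 2
# bits; entropy above 1 is normal once there are more than 2 classes.
def sanity_check_entropy():
    two_class = numpy.array([[1.0, 0], [2.0, 0], [3.0, 1], [4.0, 1]])
    four_class = numpy.array([[1.0, 0], [2.0, 1], [3.0, 2], [4.0, 3]])
    print(get_entropy(two_class))   # expected: 1.0
    print(get_entropy(four_class))  # expected: 2.0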



def main():
    ecol = csv_to_array('ecoli.csv')
    testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes = split_data(ecol)

    entropy = get_entropy(ecol)
    print(entropy)

main()

Best answer

The following function calculates the entropy:

import math

# Function to return Shannon's Entropy
def entropy(attributes, dataset, targetAttr):
    freq = {}
    entropy = 0.0
    index = 0
    for item in attributes:
        if (targetAttr == item):
            break
        else:
            index = index + 1
    for item in dataset:
        if ((item[index]) in freq):
            # Increase the frequency count for this value
            freq[item[index]] += 1.0
        else:
            # First occurrence: initialize the count to 1.0
            freq[item[index]] = 1.0

    for count in freq.values():
        entropy = entropy + (-count / len(dataset)) * math.log(count / len(dataset), 2)
    return entropy
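
For illustration, a hypothetical call might look like this (the attribute names and rows below are made up; they are not from the question's dataset):

attributes = ['feature_1', 'feature_2', 'class']
dataset = [
    [0.1, 0.2, 'a'],
    [0.3, 0.4, 'b'],
    [0.5, 0.6, 'a'],
]
# Entropy of the 'class' column: -(2/3)*log2(2/3) - (1/3)*log2(1/3), about 0.918
print(entropy(attributes, dataset, 'class'))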

As @MattTimmermans pointed out, the range of entropy values actually depends on the number of classes. For exactly 2 classes, entropy is confined to the range 0 to 1 (inclusive). For more than 2 classes (which is the case being tested here), however, entropy ranges from 0 up to log2(n) for n classes, so values above 1 are expected; the same -p * log2(p) summation applies (translated into the Pythonic code above). This post here explains the math and the calculations in more detail.
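
Concretely, the UCI ecoli dataset has 8 class labels, so the entropy of its class column can be anywhere from 0 up to log2(8) = 3 bits, and a value above 2 is entirely plausible. A minimal check of that upper bound (the uniform distribution here is an assumption, used only to exercise the maximum):

import math

# 8 equally likely classes -> the maximum possible entropy of 3 bits
print(sum(-(1 / 8) * math.log(1 / 8, 2) for _ in range(8)))  # 3.0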

Regarding "python - Strange results from an entropy calculation", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/55290793/
