add deleted files

belkhoujaons · belkhoujaons · commit 7a5973c129c3 · 2025-10-12T16:10:03.000+01:00
diff --git a/TEXT_SUMMARIZER/README.md b/TEXT_SUMMARIZER/README.md
@@ -0,0 +1,114 @@
+# Text Summarizer
+
+A Python script that summarizes text by keeping only the most important sentences using simple word frequency analysis.
+
+## Features
+
+- **Frequency-based Extractive Algorithm**: Analyzes word frequencies to identify important sentences
+- **Stop Words Filtering**: Removes common words to focus on meaningful content
+- **Normalized Scoring**: Scores sentences based on normalized word frequencies
+- **Maintains Original Order**: Preserves the original sequence of selected sentences
+- **Configurable Summary Ratio**: Adjust the proportion of sentences to include in the summary
+
+## Requirements
+
+- Python 3.x
+- No external dependencies (uses only standard library)
+
+## Installation
+
+No installation required. Simply download the script:
+
+```bash
+git clone https://github.com/sumanth-0/100LinesOfPythonCode.git
+cd 100LinesOfPythonCode/TEXT_SUMMARIZER
+```
+
+## Usage
+
+### Interactive Mode
+
+Run the script and enter text when prompted:
+
+```bash
+python text_summarizer.py
+```
+
+### Programmatic Usage
+
+```python
+from text_summarizer import summarize
+
+text = """
+Your long text goes here. The text summarizer will analyze word frequencies
+and select the most important sentences. It uses an extractive approach,
+meaning it pulls sentences directly from the original text rather than
+generating new ones.
+"""
+
+# Summarize with default 30% ratio
+summary = summarize(text)
+print(summary)
+
+# Summarize with custom ratio (e.g., 50% of sentences)
+summary = summarize(text, ratio=0.5)
+print(summary)
+```
+
+## How It Works
+
+1. **Text Cleaning**: Removes extra whitespace and normalizes the input text
+2. **Sentence Tokenization**: Splits text into individual sentences
+3. **Word Tokenization**: Extracts words and converts them to lowercase
+4. **Frequency Analysis**: Calculates word frequencies while filtering stop words
+5. **Sentence Scoring**: Scores each sentence based on the frequency of words it contains
+6. **Sentence Selection**: Selects top-scoring sentences based on the specified ratio
+7. **Order Preservation**: Returns selected sentences in their original order
+
+## Algorithm Details
+
+### Stop Words Filtering
+
+The script filters out common English stop words like "the", "is", "at", "which", etc. This helps focus on content-bearing words.
+
+### Scoring Formula
+
+Each sentence is scored using:
+```
+sentence_score = sum(normalized_word_frequencies) / number_of_words
+```
+
+This approach favors sentences with high-frequency important words while normalizing for sentence length.
+
+## Example
+
+### Input
+```
+Artificial intelligence is transforming technology. Machine learning algorithms can process vast amounts of data. Neural networks are inspired by the human brain. Deep learning has revolutionized computer vision. Natural language processing enables computers to understand text. AI applications are everywhere in modern life.
+```
+
+### Output (30% ratio - 1 sentence)
+
+Six sentences at a 30% ratio give `max(1, int(6 * 0.3)) = 1` sentence:
+```
+Deep learning has revolutionized computer vision.
+```
+
+## Limitations
+
+- Works best with structured, well-written text
+- May not capture context or semantic relationships
+- Limited stop word list (can be expanded)
+- No handling of complex linguistic structures
+- Best suited for informative/factual text rather than narrative content
+
+## Contributing
+
+Feel free to submit issues, fork the repository, and create pull requests for any improvements.
+
+## License
+
+This project is part of the 100LinesOfPythonCode repository. See the main repository for license information.
+
+## References
+
+- Issue: #682
+- Repository: [100LinesOfPythonCode](https://github.com/sumanth-0/100LinesOfPythonCode)
diff --git a/TEXT_SUMMARIZER/text_summarizer.py b/TEXT_SUMMARIZER/text_summarizer.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""
+Text Summarizer using Frequency-based Extractive Algorithm
+Summarizes text by keeping only the most important sentences.
+"""
+
+import re
+from collections import Counter
+from string import punctuation
+
+
+def clean_text(text):
+    """Collapse every whitespace run to one space and trim both ends."""
+    return re.sub(r'\s+', ' ', text).strip()
+
+
+def tokenize_sentences(text):
+    """Split text into sentences.
+
+    A sentence ends at a run of '.', '!' or '?' that is followed by
+    whitespace or by the end of the text. Punctuation inside a token, such as
+    the dot in "3.14" or "example.com", does not end a sentence.
+
+    Args:
+        text: Text to split.
+
+    Returns:
+        List of non-empty sentences, stripped of surrounding whitespace and
+        without their terminating punctuation.
+    """
+    # The lookahead keeps the split from firing in the middle of a token.
+    pieces = re.split(r'[.!?]+(?=\s|$)', text)
+    stripped = (piece.strip() for piece in pieces)
+    return [piece for piece in stripped if piece]
+
+
+def tokenize_words(text):
+    """Return the lowercased runs of ASCII letters found in text, in order."""
+    lowered = text.lower()
+    return [match.group(0) for match in re.finditer(r'\b[a-zA-Z]+\b', lowered)]
+
+
+def get_word_frequencies(sentences):
+    """Count non-stop words across sentences and scale the counts.
+
+    Every word produced by tokenize_words() that is not in the stop-word set
+    is counted. Counts are then divided by the highest count, so the most
+    frequent word maps to 1.0.
+
+    Args:
+        sentences: Iterable of sentence strings.
+
+    Returns:
+        Counter mapping each kept word to its normalized frequency; empty
+        when no word survives the stop-word filter.
+    """
+    # Common function words that say little about a sentence's topic.
+    ignored = {'the', 'is', 'at', 'which', 'on', 'a', 'an', 'and', 'or',
+               'but', 'in', 'with', 'to', 'for', 'of', 'as', 'by', 'that',
+               'this', 'it', 'from', 'be', 'are', 'was', 'were', 'been',
+               'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
+               'could', 'should', 'may', 'might', 'can'}
+
+    counts = Counter(
+        word
+        for sentence in sentences
+        for word in tokenize_words(sentence)
+        if word not in ignored
+    )
+    if not counts:
+        return counts
+
+    peak = max(counts.values())
+    return Counter({word: count / peak for word, count in counts.items()})
+
+
+def score_sentences(sentences, word_freq):
+    """Map each sentence to the mean frequency weight of its words.
+
+    A sentence's score is the sum of the word_freq values of every word
+    returned by tokenize_words(), divided by the number of those words.
+    Words missing from word_freq contribute 0; a sentence with no words
+    scores 0.
+
+    Args:
+        sentences: Iterable of sentence strings.
+        word_freq: Mapping of word to frequency weight.
+
+    Returns:
+        Dict keyed by sentence text with the sentence's score as value.
+    """
+    scores = {}
+    for sentence in sentences:
+        tokens = tokenize_words(sentence)
+        if not tokens:
+            scores[sentence] = 0
+            continue
+        total = sum(word_freq.get(token, 0) for token in tokens)
+        scores[sentence] = total / len(tokens)
+    return scores
+
+
+def summarize(text, ratio=0.3):
+    """Summarize text by extracting its top-scoring sentences.
+
+    The text is whitespace-normalized, split into sentences, and each
+    distinct sentence is scored with score_sentences(). The best
+    ``max(1, int(len(sentences) * ratio))`` sentences are returned in the
+    order they first appear in the input, each followed by the punctuation
+    run ('.', '!' or '?') that ended it in the input. A sentence that had no
+    terminator is closed with '.'.
+
+    Args:
+        text: Input text to summarize
+        ratio: Proportion of sentences to keep (0.0 to 1.0)
+
+    Returns:
+        Summarized text. Text with two or fewer sentences is returned
+        whitespace-normalized and otherwise unchanged.
+    """
+    text = clean_text(text)
+    sentences = tokenize_sentences(text)
+
+    if len(sentences) <= 2:
+        return text
+
+    word_freq = get_word_frequencies(sentences)
+    sentence_scores = score_sentences(sentences, word_freq)
+
+    # tokenize_sentences() drops the closing punctuation, so walk the cleaned
+    # text once to recover it. Only the first occurrence of a repeated
+    # sentence is recorded; that also yields each sentence's original
+    # position without a linear list.index() scan per sort key.
+    terminator_run = re.compile(r'[.!?]+')
+    first_position = {}
+    closing = {}
+    cursor = 0
+    for position, sentence in enumerate(sentences):
+        start = text.find(sentence, cursor)
+        mark = None
+        if start != -1:
+            cursor = start + len(sentence)
+            mark = terminator_run.match(text, cursor)
+            if mark:
+                cursor = mark.end()
+        if sentence not in first_position:
+            first_position[sentence] = position
+            closing[sentence] = mark.group() if mark else '.'
+
+    # Select top sentences; the stable sort keeps earlier sentences ahead of
+    # later ones when scores tie.
+    num_sentences = max(1, int(len(sentences) * ratio))
+    ranked = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
+
+    # Maintain original order
+    selected = sorted(ranked[:num_sentences], key=first_position.__getitem__)
+    return ' '.join(sentence + closing[sentence] for sentence in selected)
+
+
+if __name__ == "__main__":
+    # Interactive entry point: read one line of text and print its summary.
+    rule = "=" * 55
+    print("Text Summarizer - Frequency-based Extractive Algorithm")
+    print(rule)
+    text = input("\nEnter text to summarize:\n")
+
+    if not text.strip():
+        print("No text provided!")
+    else:
+        summary = summarize(text)
+        print("\n" + rule)
+        print("SUMMARY:")
+        print(rule)
+        print(summary)